In [1]:
#PREPROCESS STEP

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler


# USING FUNCTION TO HANDLE_OUTLIERS
def handle_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5*IQR
    upper_bound = Q3 + 1.5*IQR
    df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df


# DATASET
df = pd.read_csv(r"C:\Users\nh013\Desktop\FASION DATASET UK AND US\mock_fashion_data_uk_us.csv")

#FEATURE 

df=df[['Product Name','Price','Brand','Category','Description','Rating','Review Count','Style Attributes','Total Sizes','Available Sizes','Color','Purchase History','Age','Fashion Magazines','Fashion Influencers','Season','Time Period Highest Purchase','Customer Reviews','Social Media Comments','feedback'
      ]]

# IDENTIFYING MISSINFG VALUES

print(df.isnull().sum())


# HANDLE MISSING VALUES
df.fillna(value=np.nan, inplace=True)

# DROP MISSING ROWS
df.dropna(inplace=True)

# REMOVE DUPLICATES
df.drop_duplicates(inplace=True)

# SELECT CATEGORICAL COLUMN 
categorical_cols = ['Brand', 'Category', 'Style Attributes', 'Total Sizes', 'Available Sizes', 'Color', 'Purchase History',
                    'Fashion Magazines', 'Fashion Influencers', 'Season', 'Time Period Highest Purchase']

df_categorical = df[categorical_cols]

# CATEGORICAL COLUMN TO NUMERIC USING ONE HOT ENCODING
df_encoded = pd.get_dummies(df_categorical)

# EXTRACT NUMERIC COLUMN 
numerical_cols = ['Age', 'Rating']

df_numerical = df[numerical_cols]

# CONVERT STRING VALUES TO FLOAT
df_numerical['Age'] = df_numerical['Age'].astype(float)

# COMBINE THE  CATEGORICAL FEATURE AND NUMERICAL FEATURE 
df_preprocessed = pd.concat([df_encoded, df_numerical], axis=1)

# NORMALIZE AND SCALING ALL NUMERICAL COLUMN
scaler = MinMaxScaler()
numerical_cols = df.select_dtypes(include='number').columns
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])


scaler = StandardScaler()
numerical_cols = df.select_dtypes(include='number').columns
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])


print(df_preprocessed)

Product Name                    0
Price                           0
Brand                           0
Category                        0
Description                     0
Rating                          0
Review Count                    0
Style Attributes                0
Total Sizes                     0
Available Sizes                 0
Color                           0
Purchase History                0
Age                             0
Fashion Magazines               0
Fashion Influencers             0
Season                          0
Time Period Highest Purchase    0
Customer Reviews                0
Social Media Comments           0
feedback                        0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_numerical['Age'] = df_numerical['Age'].astype(float)


        Brand_Alexander McQueen  Brand_Burberry  Brand_Calvin Klein  \
0                             0               0                   0   
1                             0               0                   0   
2                             0               0                   0   
3                             1               0                   0   
4                             0               0                   0   
...                         ...             ...                 ...   
999995                        0               0                   0   
999996                        0               0                   0   
999997                        0               0                   0   
999998                        0               0                   0   
999999                        0               1                   0   

        Brand_Jigsaw  Brand_Mulberry  Brand_Ralph Lauren  Brand_Ted Baker  \
0                  0               0                   1              

In [2]:
# preprocess step using NLTK FOR text data....


import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import re

# DATASET
df = pd.read_csv(r"C:\Users\nh013\Desktop\FASION DATASET UK AND US\mock_fashion_data_uk_us.csv")

# REMOVE MISSING VALUES 
df.dropna(inplace=True)

# REMOVE DUPLICATE ROWS
df.drop_duplicates(inplace=True)

# REMOVE ANY URLS
df['Customer Reviews'] = df['Customer Reviews'].apply(lambda x: re.sub(r'http\S+', '', x))
df['Product Name'] = df['Product Name'].apply(lambda x: re.sub(r'http\S+', '', x))
df['Social Media Comments'] = df['Social Media Comments'].apply(lambda x: re.sub(r'http\S+', '', x))
df['feedback'] = df['feedback'].apply(lambda x: re.sub(r'http\S+', '', x))
df['Brand'] = df['Brand'].apply(lambda x: re.sub(r'http\S+', '', str(x)))
df['Category'] = df['Category'].apply(lambda x: re.sub(r'http\S+', '', str(x)))

# REMOVE SPECIAL CHARACTERS
df['Customer Reviews'] = df['Customer Reviews'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df['Product Name'] = df['Product Name'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df['Social Media Comments'] = df['Social Media Comments'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df['feedback'] = df['feedback'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df['Brand'] = df['Brand'].apply(lambda x: re.sub(r'[^\w\s]', '', str(x)))
df['Category'] = df['Category'].apply(lambda x: re.sub(r'[^\w\s]', '', str(x)))

# CONVERT ALL TEXT TO LOWERCASE
df['Customer Reviews'] = df['Customer Reviews'].apply(lambda x: x.lower())
df['Product Name'] = df['Product Name'].apply(lambda x: x.lower())
df['Social Media Comments'] = df['Social Media Comments'].apply(lambda x: x.lower())
df['feedback'] = df['feedback'].apply(lambda x: x.lower())
df['Brand'] = df['Brand'].str.lower()
df['Category'] = df['Category'].str.lower()

# REMOVE STOP WORDS
stop_words = set(stopwords.words('english'))
df['Customer Reviews'] = df['Customer Reviews'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
df['Product Name'] = df['Product Name'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
df['Social Media Comments'] = df['Social Media Comments'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
df['feedback'] = df['feedback'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
df['Brand'] = df['Brand'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in stop_words]))
df['Category'] = df['Category'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in stop_words]))


# STEMMING
stemmer = PorterStemmer()
df['Customer Reviews'] = df['Customer Reviews'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
df['Product Name'] = df['Product Name'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
df['Social Media Comments'] = df['Social Media Comments'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
df['feedback'] = df['feedback'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
df['Brand'] = df['Brand'].apply(lambda x: ' '.join([stemmer.stem(word) for word in str(x).split()]))
df['Category'] = df['Category'].apply(lambda x: ' '.join([stemmer.stem(word) for word in str(x).split()]))


# USING GROUPBY METHOD FOR TOP PRODUCTS
top_products = df.groupby('Product Name').size().nlargest(50).index.tolist()

# CREATE DATAFRAME ONLY FOR THE TOP PRODUCTS
df_top_products = df[df['Product Name'].isin(top_products)]

# SHUFFLE THE ROWS OF THE DATAFRAME
df_top_products = df_top_products.sample(frac=1).reset_index(drop=True)

print(df_top_products)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


     Product Name      Price         Brand   Category Description    Rating  \
0            r9x4  42.813271  calvin klein  accessori   Very Good  2.663849   
1            s8f1  30.256069      mulberri    jewelri    Very Bad  2.325738   
2            o2a6  54.230751  tommi hilfig     bottom   Very Good  1.440138   
3            g0v3  34.380568  ralph lauren    jewelri    Not Good  3.317355   
4            r9x4  63.095972  ralph lauren      dress    Not Good  3.381028   
...           ...        ...           ...        ...         ...       ...   
1479         a7i6  35.569663     ted baker   swimwear       Worst  3.125005   
1480         n0y3  46.426757  tommi hilfig        top        Good  3.504431   
1481         r3k6  74.321983        jigsaw   swimwear    Not Good  1.576507   
1482         i9h8  67.336867     ted baker    jewelri   Very Good  3.668600   
1483         c1s9  43.173677     ted baker     bottom         Bad  3.396107   

      Review Count Style Attributes Total Sizes Ava

In [1]:
#combines the preprocessing steps and feature selection using the top products, and then performs normalization and scaling 
#on the numerical features


import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import re

# FUNCTION TO HANDLE OUTLIER
def handle_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df


# DATASET 
df = pd.read_csv(r"C:\Users\nh013\Desktop\FASION DATASET UK AND US\mock_fashion_data_uk_us.csv")

# REMOVE MISSING VALUES
df.dropna(inplace=True)

# REMOVE DUPLICATES
df.drop_duplicates(inplace=True)

# REMOVE ANY URLS
df['Customer Reviews'] = df['Customer Reviews'].apply(lambda x: re.sub(r'http\S+', '', x))
df['Product Name'] = df['Product Name'].apply(lambda x: re.sub(r'http\S+', '', x))
df['Social Media Comments'] = df['Social Media Comments'].apply(lambda x: re.sub(r'http\S+', '', x))
df['feedback'] = df['feedback'].apply(lambda x: re.sub(r'http\S+', '', x))
df['Brand'] = df['Brand'].apply(lambda x: re.sub(r'http\S+', '', str(x)))
df['Category'] = df['Category'].apply(lambda x: re.sub(r'http\S+', '', str(x)))

#REMOVE ANY SPECIAL CHERECHTER
df['Customer Reviews'] = df['Customer Reviews'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df['Product Name'] = df['Product Name'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df['Social Media Comments'] = df['Social Media Comments'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df['feedback'] = df['feedback'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df['Brand'] = df['Brand'].apply(lambda x: re.sub(r'[^\w\s]', '', str(x)))
df['Category'] = df['Category'].apply(lambda x: re.sub(r'[^\w\s]', '', str(x)))


# CONVERT ALL TEXT TO LOWER CASE
df['Customer Reviews'] = df['Customer Reviews'].apply(lambda x: x.lower())
df['Product Name'] = df['Product Name'].apply(lambda x: x.lower())
df['Social Media Comments'] = df['Social Media Comments'].apply(lambda x: x.lower())
df['feedback'] = df['feedback'].apply(lambda x: x.lower())
df['Brand'] = df['Brand'].str.lower()
df['Category'] = df['Category'].str.lower()


# REMOVE STOP WORDS
stop_words = set(stopwords.words('english'))
df['Customer Reviews'] = df['Customer Reviews'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
df['Product Name'] = df['Product Name'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
df['Social Media Comments'] = df['Social Media Comments'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
df['feedback'] = df['feedback'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
df['Brand'] = df['Brand'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in stop_words]))
df['Category'] = df['Category'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in stop_words]))


# STEMMING
stemmer = PorterStemmer()
df['Customer Reviews'] = df['Customer Reviews'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
df['Product Name'] = df['Product Name'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
df['Social Media Comments'] = df['Social Media Comments'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
df['feedback'] = df['feedback'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
df['Brand'] = df['Brand'].apply(lambda x: ' '.join([stemmer.stem(word) for word in str(x).split()]))
df['Category'] = df['Category'].apply(lambda x: ' '.join([stemmer.stem(word) for word in str(x).split()]))


# GROUPBY METHOD FOR TOP PRODUCTS
top_products = df.groupby('Product Name').size().nlargest(50).index.tolist()

# DATAFRAME ONLY FOR TOP PRODUCTS
df_top_products = df[df['Product Name'].isin(top_products)]

# SUFFLE THE ROWS OF THE DATAFRAME
df_top_products = df_top_products.sample(frac=1).reset_index(drop=True)

# FEATURE SELECTION
df_top_products = df_top_products[['Product Name', 'Price', 'Brand', 'Category', 'Description', 'Rating', 'Review Count',
                                   'Style Attributes', 'Total Sizes', 'Available Sizes', 'Color', 'Purchase History',
                                   'Age', 'Fashion Magazines', 'Fashion Influencers', 'Season',
                                   'Time Period Highest Purchase', 'Customer Reviews', 'Social Media Comments', 'feedback']]

# IDENTIFYING MISSING VALUES
print(df_top_products.isnull().sum())

# HANDLE MISSING VALUES
df_top_products.fillna(value=np.nan, inplace=True)

# DROP MISSING ROWS
df_top_products.dropna(inplace=True)

# REMOVE DUPLICATES
df_top_products.drop_duplicates(inplace=True)

# SELECT CATEGORICAL COLUMN
categorical_cols = ['Brand', 'Category', 'Style Attributes', 'Total Sizes', 'Available Sizes', 'Color',
                    'Purchase History', 'Fashion Magazines', 'Fashion Influencers', 'Season',
                    'Time Period Highest Purchase']

df_categorical = df_top_products[categorical_cols]

# ONE HOT ENCODING
df_encoded = pd.get_dummies(df_categorical)

#NUMERICAL COLUMN
numerical_cols = ['Price', 'Rating', 'Review Count', 'Age']

df_numerical = df_top_products[numerical_cols]

# CONVERT STRING VALUES TO FLOAT
df_numerical['Price'] = df_numerical['Price'].astype(float)
df_numerical['Rating'] = df_numerical['Rating'].astype(float)
df_numerical['Review Count'] = df_numerical['Review Count'].astype(float)
df_numerical['Age'] = df_numerical['Age'].astype(float)

# COMBINE THE CATEGORICAL FEATURE AND NUMERICAL FEATURE
df_preprocessed = pd.concat([df_encoded, df_numerical], axis=1)

# NORMALIZE AND SCALE AND STANDARIZE ALL NUMERICAL COLUMN
scaler = MinMaxScaler()
numerical_cols = df_numerical.columns
df_preprocessed[numerical_cols] = scaler.fit_transform(df_numerical)

scaler = StandardScaler()
df_preprocessed[numerical_cols] = scaler.fit_transform(df_preprocessed[numerical_cols])

print(df_preprocessed)
print(df_top_products)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Product Name                    0
Price                           0
Brand                           0
Category                        0
Description                     0
Rating                          0
Review Count                    0
Style Attributes                0
Total Sizes                     0
Available Sizes                 0
Color                           0
Purchase History                0
Age                             0
Fashion Magazines               0
Fashion Influencers             0
Season                          0
Time Period Highest Purchase    0
Customer Reviews                0
Social Media Comments           0
feedback                        0
dtype: int64
      Brand_alexand mcqueen  Brand_burberri  Brand_calvin klein  Brand_jigsaw  \
0                         0               0                   0             0   
1                         0               0                   1             0   
2                         0               1                   0 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_numerical['Price'] = df_numerical['Price'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_numerical['Rating'] = df_numerical['Rating'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_numerical['Review Count'] = df_numerical['Review Count'].astype(float)
A value i