In [3]:
# USING XGBOOST MODEL TO PREDICT PRICE AND RATING IN2024 

import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import re
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# FUNCTION HANDLE OUTLIER
def handle_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df


#DATASET
df = pd.read_csv(r"C:\Users\nh013\Desktop\FASION DATASET UK AND US\mock_fashion_data_uk_us.csv")

# REMOVE MISSING VALUES
df.dropna(inplace=True)

# REMOVE DUPLICATES
df.drop_duplicates(inplace=True)

# REMOVE ANY URLS
df['Customer Reviews'] = df['Customer Reviews'].apply(lambda x: re.sub(r'http\S+', '', x))
df['Product Name'] = df['Product Name'].apply(lambda x: re.sub(r'http\S+', '', x))
df['Social Media Comments'] = df['Social Media Comments'].apply(lambda x: re.sub(r'http\S+', '', x))
df['feedback'] = df['feedback'].apply(lambda x: re.sub(r'http\S+', '', x))

# REMOVE SPECIAL CHARECHTER
df['Customer Reviews'] = df['Customer Reviews'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df['Product Name'] = df['Product Name'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df['Social Media Comments'] = df['Social Media Comments'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df['feedback'] = df['feedback'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

# CONVERT ALL TEXT TO LOWER CASE
df['Customer Reviews'] = df['Customer Reviews'].apply(lambda x: x.lower())
df['Product Name'] = df['Product Name'].apply(lambda x: x.lower())
df['Social Media Comments'] = df['Social Media Comments'].apply(lambda x: x.lower())
df['feedback'] = df['feedback'].apply(lambda x: x.lower())

#REMOVE STOP WORDS
stop_words = set(stopwords.words('english'))
df['Customer Reviews'] = df['Customer Reviews'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
df['Product Name'] = df['Product Name'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
df['Social Media Comments'] = df['Social Media Comments'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
df['feedback'] = df['feedback'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# STEMMING
stemmer = PorterStemmer()
df['Customer Reviews'] = df['Customer Reviews'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
df['Product Name'] = df['Product Name'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
df['Social Media Comments'] = df['Social Media Comments'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
df['feedback'] = df['feedback'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

# USING GROUPBY METHOD FOR TOP PRODUCTS
top_products = df.groupby('Product Name').size().nlargest(50).index.tolist()

# DATAFRAME ONLY FOR TOP PRODUCTS
df_top_products = df[df['Product Name'].isin(top_products)]

# SHUFFLE THE ROWS OF THE DATAFRAME
df_top_products = df_top_products.sample(frac=1).reset_index(drop=True)

# FEATURE
df_top_products = df_top_products[['Product Name', 'Price', 'Brand', 'Category', 'Description', 'Rating', 'Review Count',
                                   'Style Attributes', 'Total Sizes', 'Available Sizes', 'Color', 'Purchase History',
                                   'Age', 'Fashion Magazines', 'Fashion Influencers', 'Season',
                                   'Time Period Highest Purchase', 'Customer Reviews', 'Social Media Comments', 'feedback']]

# IDENTIFYING MISSING VALUES
print(df_top_products.isnull().sum())

# HANDLE MISSING VALUES
df_top_products.fillna(value=np.nan, inplace=True)

# DROP MISSING ROWS
df_top_products.dropna(inplace=True)

# REMOVE DUPLICATES
df_top_products.drop_duplicates(inplace=True)

# SELECT CATEGORICAL COLUMN
categorical_cols = ['Brand', 'Category', 'Style Attributes', 'Total Sizes', 'Available Sizes', 'Color',
                    'Purchase History', 'Fashion Magazines', 'Fashion Influencers', 'Season',
                    'Time Period Highest Purchase']

df_categorical = df_top_products[categorical_cols]

#ONE HOT ENCODING CATEGORICAL TO NUMERIC
df_encoded = pd.get_dummies(df_categorical)

# EXTRAC NUMERIC COLUMN
numerical_cols = ['Price', 'Rating', 'Review Count', 'Age']

df_numerical = df_top_products[numerical_cols]

# CONVERT STRING VALUES TO FLOAT
df_numerical['Price'] = df_numerical['Price'].astype(float)
df_numerical['Rating'] = df_numerical['Rating'].astype(float)
df_numerical['Review Count'] = df_numerical['Review Count'].astype(float)
df_numerical['Age'] = df_numerical['Age'].astype(float)

# COMBINE THE CATEGORICAL FEATURE AND NUMERICAL FEATURE
df_preprocessed = pd.concat([df_encoded, df_numerical], axis=1)

# NORMALIZE AND SCALE AND STANDARIZE ALL NUMERICAL COLUMN
scaler = MinMaxScaler()
numerical_cols = df_numerical.columns
df_preprocessed[numerical_cols] = scaler.fit_transform(df_numerical)

scaler = StandardScaler()
df_preprocessed[numerical_cols] = scaler.fit_transform(df_preprocessed[numerical_cols])

# SPLIT DATA
X = df_preprocessed.drop(['Price', 'Rating'], axis=1)
y_price = df_preprocessed['Price']
y_rating = df_preprocessed['Rating']

X_train, X_test, y_price_train, y_price_test, y_rating_train, y_rating_test = train_test_split(X, y_price, y_rating, test_size=0.2, random_state=42)

# XGBOOST MODEL FOR PRICE PREDICTION
price_model = XGBRegressor()
price_model.fit(X_train, y_price_train)

# XGBOOST MODEL FOR RATING PREDICTION
rating_model = XGBRegressor()
rating_model.fit(X_train, y_rating_train)

# PREDICTION FOR 2024
X_2024 = X  # USE ALL PREPROCEESD DATASET FOR PREDICTION
price_2024 = price_model.predict(X_2024)
rating_2024 = rating_model.predict(X_2024)

# DATAFRAME WITH THE PREDICTED VALUES FOR 2024
df_predictions_2024 = df_top_products[['Product Name']].copy()
df_predictions_2024['Predicted Price 2024'] = price_2024
df_predictions_2024['Predicted Rating 2024'] = rating_2024

# FILTER THE PREDICTION FOR THE TOP PRODUCTS
df_top_product_predictions_2024 = df_predictions_2024[df_predictions_2024['Product Name'].isin(top_products)]


print(df_top_product_predictions_2024)




# EVALUATION PRICE PREDICTION
price_pred_train = price_model.predict(X_train)
price_pred_test = price_model.predict(X_test)

price_mae = mean_absolute_error(y_price_test, price_pred_test)
price_mse = mean_squared_error(y_price_test, price_pred_test)
price_rmse = np.sqrt(price_mse)


print("Price Prediction Evaluation:")
print("MAE: ", price_mae)
print("MSE: ", price_mse)
print("RMSE: ", price_rmse)


# EVALUATION RATING PREDICTION
rating_pred_train = rating_model.predict(X_train)
rating_pred_test = rating_model.predict(X_test)

rating_mae = mean_absolute_error(y_rating_test, rating_pred_test)
rating_mse = mean_squared_error(y_rating_test, rating_pred_test)
rating_rmse = np.sqrt(rating_mse)


print("Rating Prediction Evaluation:")
print("MAE: ", rating_mae)
print("MSE: ", rating_mse)
print("RMSE: ", rating_rmse)



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Product Name                    0
Price                           0
Brand                           0
Category                        0
Description                     0
Rating                          0
Review Count                    0
Style Attributes                0
Total Sizes                     0
Available Sizes                 0
Color                           0
Purchase History                0
Age                             0
Fashion Magazines               0
Fashion Influencers             0
Season                          0
Time Period Highest Purchase    0
Customer Reviews                0
Social Media Comments           0
feedback                        0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_numerical['Price'] = df_numerical['Price'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_numerical['Rating'] = df_numerical['Rating'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_numerical['Review Count'] = df_numerical['Review Count'].astype(float)
A value i

     Product Name  Predicted Price 2024  Predicted Rating 2024
0            a9t9             -1.587170              -1.607472
1            o2a6              1.341656               1.295767
2            n0y3             -0.336016              -0.800675
3            n4b8             -1.110213              -0.572290
4            d6f7             -1.254651              -0.503330
...           ...                   ...                    ...
1479         s8f1              0.720871              -0.298990
1480         q1h7             -1.477672               1.404624
1481         k2q6             -0.480564              -1.385527
1482         c1s9             -0.997624               1.151417
1483         e9y6              0.704349               0.779193

[1484 rows x 3 columns]
Price Prediction Evaluation:
MAE:  0.8845038756586545
MSE:  1.119356564390383
RMSE:  1.0579964860009616
Rating Prediction Evaluation:
MAE:  0.9759929215116556
MSE:  1.3648250548813647
RMSE:  1.168257272556591


In [4]:
## USING XGBOOST MODEL TO PREDICT PRICE AND RATING IN2024

import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import re
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# FUNCTION HANDLE OUTLIER
def handle_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df


#DATASET
df = pd.read_csv(r"C:\Users\nh013\Desktop\FASION DATASET UK AND US\mock_fashion_data_uk_us.csv")

# REMOVE MISSING VALUES
df.dropna(inplace=True)

# REMOVE DUPLICATES
df.drop_duplicates(inplace=True)

# REMOVE ANY URLS
df['Customer Reviews'] = df['Customer Reviews'].apply(lambda x: re.sub(r'http\S+', '', x))
df['Product Name'] = df['Product Name'].apply(lambda x: re.sub(r'http\S+', '', x))
df['Social Media Comments'] = df['Social Media Comments'].apply(lambda x: re.sub(r'http\S+', '', x))
df['feedback'] = df['feedback'].apply(lambda x: re.sub(r'http\S+', '', x))

# REMOVE SPECIAL CHARECHTER
df['Customer Reviews'] = df['Customer Reviews'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df['Product Name'] = df['Product Name'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df['Social Media Comments'] = df['Social Media Comments'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df['feedback'] = df['feedback'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

# CONVERT ALL TEXT TO LOWER CASE
df['Customer Reviews'] = df['Customer Reviews'].apply(lambda x: x.lower())
df['Product Name'] = df['Product Name'].apply(lambda x: x.lower())
df['Social Media Comments'] = df['Social Media Comments'].apply(lambda x: x.lower())
df['feedback'] = df['feedback'].apply(lambda x: x.lower())

#REMOVE STOP WORDS
stop_words = set(stopwords.words('english'))
df['Customer Reviews'] = df['Customer Reviews'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
df['Product Name'] = df['Product Name'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
df['Social Media Comments'] = df['Social Media Comments'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
df['feedback'] = df['feedback'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# STEMMING
stemmer = PorterStemmer()
df['Customer Reviews'] = df['Customer Reviews'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
df['Product Name'] = df['Product Name'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
df['Social Media Comments'] = df['Social Media Comments'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
df['feedback'] = df['feedback'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

# USING GROUPBY METHOD FOR TOP PRODUCTS
top_products = df.groupby('Product Name').size().nlargest(50).index.tolist()

# DATAFRAME ONLY FOR TOP PRODUCTS
df_top_products = df[df['Product Name'].isin(top_products)]

# SHUFFLE THE ROWS OF THE DATAFRAME
df_top_products = df_top_products.sample(frac=1).reset_index(drop=True)

# FEATURE
df_top_products = df_top_products[['Product Name', 'Price', 'Brand', 'Category', 'Description', 'Rating', 'Review Count',
                                   'Style Attributes', 'Total Sizes', 'Available Sizes', 'Color', 'Purchase History',
                                   'Age', 'Fashion Magazines', 'Fashion Influencers', 'Season',
                                   'Time Period Highest Purchase', 'Customer Reviews', 'Social Media Comments', 'feedback']]

# IDENTIFYING MISSING VALUES
print(df_top_products.isnull().sum())

# HANDLE MISSING VALUES
df_top_products.fillna(value=np.nan, inplace=True)

# DROP MISSING ROWS
df_top_products.dropna(inplace=True)

# REMOVE DUPLICATES
df_top_products.drop_duplicates(inplace=True)

# SELECT CATEGORICAL COLUMN
categorical_cols = ['Brand', 'Category', 'Style Attributes', 'Total Sizes', 'Available Sizes', 'Color',
                    'Purchase History', 'Fashion Magazines', 'Fashion Influencers', 'Season',
                    'Time Period Highest Purchase']

df_categorical = df_top_products[categorical_cols]

#ONE HOT ENCODING CATEGORICAL TO NUMERIC
df_encoded = pd.get_dummies(df_categorical)

# EXTRAC NUMERIC COLUMN
numerical_cols = ['Price', 'Rating', 'Review Count', 'Age']

df_numerical = df_top_products[numerical_cols]

# CONVERT STRING VALUES TO FLOAT
df_numerical['Price'] = df_numerical['Price'].astype(float)
df_numerical['Rating'] = df_numerical['Rating'].astype(float)
df_numerical['Review Count'] = df_numerical['Review Count'].astype(float)
df_numerical['Age'] = df_numerical['Age'].astype(float)

# COMBINE THE CATEGORICAL FEATURE AND NUMERICAL FEATURE
df_preprocessed = pd.concat([df_encoded, df_numerical], axis=1)

# NORMALIZE AND SCALE AND STANDARIZE ALL NUMERICAL COLUMN
scaler = MinMaxScaler()
numerical_cols = df_numerical.columns
df_preprocessed[numerical_cols] = scaler.fit_transform(df_numerical)

scaler = StandardScaler()
df_preprocessed[numerical_cols] = scaler.fit_transform(df_preprocessed[numerical_cols])

# SPLIT DATA TRAING AND TESTING SET
X = df_preprocessed.drop(['Price', 'Rating'], axis=1)
y_price = df_preprocessed['Price']
y_rating = df_preprocessed['Rating']
X_train, X_test, y_price_train, y_price_test, y_rating_train, y_rating_test = train_test_split(X, y_price, y_rating, test_size=0.2, random_state=42)

# XGBOOST MODEL TO PRICE PREDICTION
price_model = XGBRegressor()
price_model.fit(X_train, y_price_train)

# XGBOOST MODEL TO RATING PREDICTION
rating_model = XGBRegressor()
rating_model.fit(X_train, y_rating_train)

# PREDICT ON THE TEST SET
y_price_pred = price_model.predict(X_test)
y_rating_pred = rating_model.predict(X_test)

# DATARAME FOR THE PREDICTION VALUES
predictions_df = pd.DataFrame({'Product Name': df_top_products.iloc[X_test.index]['Product Name'],
                               'Original Price': df_top_products.iloc[X_test.index]['Price'],
                               'Original Rating': df_top_products.iloc[X_test.index]['Rating'],
                               'Predicted Price 2024': y_price_pred,
                               'Predicted Rating 2024': y_rating_pred})

# LOOP THROW THE PREDICTED VALUES
for index, row in predictions_df.iterrows():
    product_name = row['Product Name']
    predicted_price = row['Predicted Price 2024']
    predicted_rating = row['Predicted Rating 2024']
    
    # RETURN ORGINAL VALUES BASED ON THIS 
    original_price = row['Original Price']
    original_rating = row['Original Rating']
    
  
    print("Product Name:", product_name)
    print("Original Price:", original_price)
    print("Original Rating:", original_rating)
    print("Predicted Price 2024:", predicted_price)
    print("Predicted Rating 2024:", predicted_rating)
    print()
    
# EVALUATION PRICE PREDICTION
price_pred_train = price_model.predict(X_train)
price_pred_test = price_model.predict(X_test)

price_mae = mean_absolute_error(y_price_test, price_pred_test)
price_mse = mean_squared_error(y_price_test, price_pred_test)
price_rmse = np.sqrt(price_mse)


print("Price Prediction Evaluation:")
print("MAE: ", price_mae)
print("MSE: ", price_mse)
print("RMSE: ", price_rmse)


# EVALUATION RATING PREDICTION
rating_pred_train = rating_model.predict(X_train)
rating_pred_test = rating_model.predict(X_test)

rating_mae = mean_absolute_error(y_rating_test, rating_pred_test)
rating_mse = mean_squared_error(y_rating_test, rating_pred_test)
rating_rmse = np.sqrt(rating_mse)


print("Rating Prediction Evaluation:")
print("MAE: ", rating_mae)
print("MSE: ", rating_mse)
print("RMSE: ", rating_rmse)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Product Name                    0
Price                           0
Brand                           0
Category                        0
Description                     0
Rating                          0
Review Count                    0
Style Attributes                0
Total Sizes                     0
Available Sizes                 0
Color                           0
Purchase History                0
Age                             0
Fashion Magazines               0
Fashion Influencers             0
Season                          0
Time Period Highest Purchase    0
Customer Reviews                0
Social Media Comments           0
feedback                        0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_numerical['Price'] = df_numerical['Price'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_numerical['Rating'] = df_numerical['Rating'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_numerical['Review Count'] = df_numerical['Review Count'].astype(float)
A value i

Product Name: i5f4
Original Price: 98.353122315323
Original Rating: 3.4296197490557527
Predicted Price 2024: -0.7564535140991211
Predicted Rating 2024: -0.5999040007591248

Product Name: i3k7
Original Price: 20.88885665237987
Original Rating: 4.800780926765045
Predicted Price 2024: 0.6783867478370667
Predicted Rating 2024: -0.11618874222040176

Product Name: a4b9
Original Price: 29.284901139876983
Original Rating: 4.968563422063441
Predicted Price 2024: -0.06135401874780655
Predicted Rating 2024: 0.6948900818824768

Product Name: t2k6
Original Price: 92.0730414462844
Original Rating: 1.992618771485977
Predicted Price 2024: 0.018057847395539284
Predicted Rating 2024: 0.42608726024627686

Product Name: h5d5
Original Price: 43.24417112501563
Original Rating: 2.851113271579198
Predicted Price 2024: 1.129377007484436
Predicted Rating 2024: 0.018360432237386703

Product Name: v6x1
Original Price: 47.41387960944631
Original Rating: 1.6705929023532242
Predicted Price 2024: 0.693782389163971
Pr

In [12]:
# TRAIN MODLE USING RNNs MODEL

import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# DATASET 
df = pd.read_csv(r"C:\Users\nh013\Desktop\FASION DATASET UK AND US\mock_fashion_data_uk_us.csv")

# REMOVE MISSING VALUES
df.dropna(inplace=True)

# REMOVE DUPLICATES
df.drop_duplicates(inplace=True)

# REMOVE URL 
df['Customer Reviews'] = df['Customer Reviews'].apply(lambda x: re.sub(r'http\S+', '', x))
df['Product Name'] = df['Product Name'].apply(lambda x: re.sub(r'http\S+', '', x))
df['Social Media Comments'] = df['Social Media Comments'].apply(lambda x: re.sub(r'http\S+', '', x))
df['feedback'] = df['feedback'].apply(lambda x: re.sub(r'http\S+', '', x))
df['Brand'] = df['Brand'].apply(lambda x: re.sub(r'http\S+', '', str(x)))
df['Category'] = df['Category'].apply(lambda x: re.sub(r'http\S+', '', str(x)))

# REMOVE SPECIAL CERECHTER
df['Customer Reviews'] = df['Customer Reviews'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df['Product Name'] = df['Product Name'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df['Social Media Comments'] = df['Social Media Comments'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df['feedback'] = df['feedback'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df['Brand'] = df['Brand'].apply(lambda x: re.sub(r'[^\w\s]', '', str(x)))
df['Category'] = df['Category'].apply(lambda x: re.sub(r'[^\w\s]', '', str(x)))

# CONVERT TEXT INTO LOWER CASE
df['Customer Reviews'] = df['Customer Reviews'].apply(lambda x: x.lower())
df['Product Name'] = df['Product Name'].apply(lambda x: x.lower())
df['Social Media Comments'] = df['Social Media Comments'].apply(lambda x: x.lower())
df['feedback'] = df['feedback'].apply(lambda x: x.lower())
df['Brand'] = df['Brand'].str.lower()
df['Category'] = df['Category'].str.lower()

# REMOVE STOP WORDS
stop_words = set(stopwords.words('english'))
df['Customer Reviews'] = df['Customer Reviews'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
df['Product Name'] = df['Product Name'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
df['Social Media Comments'] = df['Social Media Comments'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
df['feedback'] = df['feedback'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
df['Brand'] = df['Brand'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in stop_words]))
df['Category'] = df['Category'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in stop_words]))

# STEMMING
stemmer = PorterStemmer()
df['Customer Reviews'] = df['Customer Reviews'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
df['Product Name'] = df['Product Name'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
df['Social Media Comments'] = df['Social Media Comments'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
df['feedback'] = df['feedback'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
df['Brand'] = df['Brand'].apply(lambda x: ' '.join([stemmer.stem(word) for word in str(x).split()]))
df['Category'] = df['Category'].apply(lambda x: ' '.join([stemmer.stem(word) for word in str(x).split()]))

# FILTER RELEVENT COLUMN
df = df[['Brand', 'Category', 'Price']]

# CREATE INPUT SEQUENCE
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Brand'] + ' ' + df['Category'])
sequences = tokenizer.texts_to_sequences(df['Brand'] + ' ' + df['Category'])
X = pad_sequences(sequences)

# NORMALIZE PRICE VALUES
scaler = MinMaxScaler()
y = scaler.fit_transform(df['Price'].values.reshape(-1, 1))

# SPLIT DATA INTO TRAINING AND TESTING SET
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# RNNs MODEL
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=X.shape[1]))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')

# TRAIN  MODEL
model.fit(X_train, y_train, epochs=10, batch_size=32)

# EVALUATE THE MODEL
loss = model.evaluate(X_test, y_test)
print('Mean Squared Error:', loss)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Mean Squared Error: 0.08330157399177551


In [13]:
# BUILD ITERECT TO USER USING RNNs MODEL , WHEN USER SUBMIT BRAND AND CETEGORY NAME HE GET REPLY PREDICTED PRICE ABOUT 
#THOSE PRODUCT


import pandas as pd
import numpy as np
import nltk
import re
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# DATASET
df = pd.read_csv(r"C:\Users\nh013\Desktop\FASION DATASET UK AND US\mock_fashion_data_uk_us.csv")

# REMOVE MISING VALUES
df.dropna(inplace=True)

# REMOVE DUPLICATES
df.drop_duplicates(inplace=True)

# REMOVE URLs
df['Customer Reviews'] = df['Customer Reviews'].apply(lambda x: re.sub(r'http\S+', '', x))
df['Product Name'] = df['Product Name'].apply(lambda x: re.sub(r'http\S+', '', x))
df['Social Media Comments'] = df['Social Media Comments'].apply(lambda x: re.sub(r'http\S+', '', x))
df['feedback'] = df['feedback'].apply(lambda x: re.sub(r'http\S+', '', x))
df['Brand'] = df['Brand'].apply(lambda x: re.sub(r'http\S+', '', str(x)))
df['Category'] = df['Category'].apply(lambda x: re.sub(r'http\S+', '', str(x)))

# REMOVE SPECIAL CHERECHTER
df['Customer Reviews'] = df['Customer Reviews'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df['Product Name'] = df['Product Name'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df['Social Media Comments'] = df['Social Media Comments'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df['feedback'] = df['feedback'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df['Brand'] = df['Brand'].apply(lambda x: re.sub(r'[^\w\s]', '', str(x)))
df['Category'] = df['Category'].apply(lambda x: re.sub(r'[^\w\s]', '', str(x)))

# CONVERT TEXT INTO LOWER CASE
df['Customer Reviews'] = df['Customer Reviews'].apply(lambda x: x.lower())
df['Product Name'] = df['Product Name'].apply(lambda x: x.lower())
df['Social Media Comments'] = df['Social Media Comments'].apply(lambda x: x.lower())
df['feedback'] = df['feedback'].apply(lambda x: x.lower())
df['Brand'] = df['Brand'].str.lower()
df['Category'] = df['Category'].str.lower()

# REMOVE STOP WORDS
stop_words = set(stopwords.words('english'))
df['Customer Reviews'] = df['Customer Reviews'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
df['Product Name'] = df['Product Name'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
df['Social Media Comments'] = df['Social Media Comments'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
df['feedback'] = df['feedback'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
df['Brand'] = df['Brand'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in stop_words]))
df['Category'] = df['Category'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in stop_words]))

# STEMMING
stemmer = PorterStemmer()
df['Customer Reviews'] = df['Customer Reviews'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
df['Product Name'] = df['Product Name'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
df['Social Media Comments'] = df['Social Media Comments'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
df['feedback'] = df['feedback'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
df['Brand'] = df['Brand'].apply(lambda x: ' '.join([stemmer.stem(word) for word in str(x).split()]))
df['Category'] = df['Category'].apply(lambda x: ' '.join([stemmer.stem(word) for word in str(x).split()]))

# FILTER RELEVENT COLUMN
df = df[['Brand', 'Category', 'Price']]

# CREATE INPUT SEQUENCE
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Brand'] + ' ' + df['Category'])
sequences = tokenizer.texts_to_sequences(df['Brand'] + ' ' + df['Category'])
X = pad_sequences(sequences)

# NORMALIZE PRICE COLUMN
scaler = MinMaxScaler()
y = scaler.fit_transform(df['Price'].values.reshape(-1, 1))

# SPLIT DATA 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# RNNs MODEL
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=X.shape[1]))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')

# TRAIN MODEL
model.fit(X_train, y_train, epochs=10, batch_size=32)

# FUNCTION TO PREDICT PRICE
def predict_price(brand, category):
    input_text = brand.lower() + ' ' + category.lower()
    input_sequence = tokenizer.texts_to_sequences([input_text])
    input_sequence = pad_sequences(input_sequence, maxlen=X.shape[1])
    predicted_price = scaler.inverse_transform(model.predict(input_sequence))
    return predicted_price[0][0]

# INTERECT WITH USER
while True:
    brand = input("Enter the brand: ")
    category = input("Enter the category: ")
    predicted_price = predict_price(brand, category)
    print("Predicted Price:", predicted_price)
    continue_interaction = input("Do you want to continue? (yes/no): ")
    if continue_interaction.lower() != 'yes':
        break


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Enter the brand: Tommy Hilfiger
Enter the category: Bottoms
Predicted Price: 54.463715
Do you want to continue? (yes/no): YES
Enter the brand: Calvin Klein
Enter the category: Outerwear
Predicted Price: 54.37183
Do you want to continue? (yes/no): YES
Enter the brand: Ted Baker
Enter the category: Lingerie
Predicted Price: 54.373688
Do you want to continue? (yes/no): YES
Enter the brand: Burberry
Enter the category: Accessories
Predicted Price: 54.463715
Do you want to continue? (yes/no): YES
Enter the brand: Mulberry
Enter the category: Footwear
Predicted Price: 54.439735
Do you want to continue? (yes/no): NO
