# **Importing Libraries**

In [54]:
import pandas as pd
import numpy as np
import re
import json
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = "colab"
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder,LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report,confusion_matrix
from imblearn.over_sampling import SMOTE

# **Data Loading**

In [2]:
df = pd.read_json('/content/renttherunway_final_data.json',lines=True)
df.head()

Unnamed: 0,fit,user_id,bust size,item_id,weight,rating,rented for,review_text,body type,review_summary,category,height,size,age,review_date
0,fit,420272,34d,2260466,137lbs,10.0,vacation,An adorable romper! Belt and zipper were a lit...,hourglass,So many compliments!,romper,"5' 8""",14,28.0,"April 20, 2016"
1,fit,273551,34b,153475,132lbs,10.0,other,I rented this dress for a photo shoot. The the...,straight & narrow,I felt so glamourous!!!,gown,"5' 6""",12,36.0,"June 18, 2013"
2,fit,360448,,1063761,,10.0,party,This hugged in all the right places! It was a ...,,It was a great time to celebrate the (almost) ...,sheath,"5' 4""",4,116.0,"December 14, 2015"
3,fit,909926,34c,126335,135lbs,8.0,formal affair,I rented this for my company's black tie award...,pear,Dress arrived on time and in perfect condition.,dress,"5' 5""",8,34.0,"February 12, 2014"
4,fit,151944,34b,616682,145lbs,10.0,wedding,I have always been petite in my upper body and...,athletic,Was in love with this dress !!!,gown,"5' 9""",12,27.0,"September 26, 2016"


# **Data Exploration**

**Data Shape and Info**

In [3]:
print("Data Shape:", df.shape)
print("\nData Info:")
print(df.info())

Data Shape: (192544, 15)

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192544 entries, 0 to 192543
Data columns (total 15 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   fit             192544 non-null  object 
 1   user_id         192544 non-null  int64  
 2   bust size       174133 non-null  object 
 3   item_id         192544 non-null  int64  
 4   weight          162562 non-null  object 
 5   rating          192462 non-null  float64
 6   rented for      192534 non-null  object 
 7   review_text     192544 non-null  object 
 8   body type       177907 non-null  object 
 9   review_summary  192544 non-null  object 
 10  category        192544 non-null  object 
 11  height          191867 non-null  object 
 12  size            192544 non-null  int64  
 13  age             191584 non-null  float64
 14  review_date     192544 non-null  object 
dtypes: float64(2), int64(3), object(10)
memory usage: 22.0+ MB
None


**Descriptive Stat of Numerical Columns**

In [4]:
print("\nDescriptive Statistics:")
print(df.describe())


Descriptive Statistics:
             user_id       item_id         rating           size  \
count  192544.000000  1.925440e+05  192462.000000  192544.000000   
mean   499494.100149  1.045684e+06       9.092371      12.245175   
std    289059.719328  8.053148e+05       1.430044       8.494877   
min         9.000000  1.233730e+05       2.000000       0.000000   
25%    250654.250000  1.950760e+05       8.000000       8.000000   
50%    499419.000000  9.483960e+05      10.000000      12.000000   
75%    750974.000000  1.678888e+06      10.000000      16.000000   
max    999997.000000  2.966087e+06      10.000000      58.000000   

                 age  
count  191584.000000  
mean       33.871017  
std         8.058083  
min         0.000000  
25%        29.000000  
50%        32.000000  
75%        37.000000  
max       117.000000  


**Data Formatting**

In [5]:
# Convert weight: remove non-numeric characters (e.g., "lbs") and convert to float.
df['weight_num'] = pd.to_numeric(
    df['weight'].str.replace(r'[^0-9]', '', regex=True),
    errors='coerce'
)

def convert_height(height_str):
    # Check if height_str is a string; if not, return NaN
    if not isinstance(height_str, str):
        return np.nan
    # Use a regular expression to extract feet and inches
    match = re.match(r"(\d+)'[\s]*(\d+)", height_str)
    if match:
        feet = int(match.group(1))
        inches = int(match.group(2))
        return feet * 12 + inches
    else:
        return np.nan

# Apply the conversion function to the 'height' column
df['height_in'] = df['height'].apply(convert_height)

# 3. Convert review_date to datetime and extract year and month.
df['review_date'] = pd.to_datetime(df['review_date'], errors='coerce')

**Detecting Missing Values**

In [6]:
# Calculate missing values for each column
missing_counts = df.isnull().sum().reset_index()
missing_counts.columns = ['Column', 'MissingCount']
print("Missing values by column:")
print(missing_counts)

# Visualize missing values as a bar chart using Plotly
fig = px.bar(missing_counts, x='Column', y='MissingCount',
             title='Missing Values per Column',
             text='MissingCount')
fig.update_traces(texttemplate='%{text}', textposition='outside')
fig.show()

Missing values by column:
            Column  MissingCount
0              fit             0
1          user_id             0
2        bust size         18411
3          item_id             0
4           weight         29982
5           rating            82
6       rented for            10
7      review_text             0
8        body type         14637
9   review_summary             0
10        category             0
11          height           677
12            size             0
13             age           960
14     review_date             0
15      weight_num         29982
16       height_in           677


**Missing Values Imputation**

In [7]:
numeric_impute_cols = ['weight_num', 'rating', 'height_in', 'age']

for col in numeric_impute_cols:
    median_value = df[col].median()
    df[col] = df[col].fillna(median_value)
    print(f"Imputed missing values in '{col}' with median: {median_value}")

categorical_impute_cols = ['bust size', 'rented for', 'body type']

for col in categorical_impute_cols:
    mode_value = df[col].mode()[0]
    df[col] = df[col].fillna(mode_value)
    print(f"Imputed missing values in '{col}' with mode: {mode_value}")

Imputed missing values in 'weight_num' with median: 135.0
Imputed missing values in 'rating' with median: 10.0
Imputed missing values in 'height_in' with median: 65.0
Imputed missing values in 'age' with median: 32.0
Imputed missing values in 'bust size' with mode: 34b
Imputed missing values in 'rented for' with mode: wedding
Imputed missing values in 'body type' with mode: hourglass


**Feature Engineering: New Column Creation**

In [8]:
# Compute BMI using the formula: BMI = (weight in lbs * 703) / (height in inches)^2
df['BMI'] = df.apply(lambda row: (row['weight_num'] * 703 / (row['height_in']**2))
                     if pd.notnull(row['height_in']) and row['height_in'] != 0 else np.nan, axis=1)

# Parse the 'bust size' column: separate the number from the cup size (e.g., "34d" -> 34 and "d")
def parse_bust(bust_str):
    if pd.isnull(bust_str):
        return np.nan, np.nan
    match = re.match(r"(\d+)([a-zA-Z]+)", bust_str.strip())
    if match:
        number = float(match.group(1))
        cup = match.group(2)
        return number, cup
    else:
        return np.nan, np.nan

# Apply the parsing function and create two new columns
df[['bust_number', 'cup_size']] = df['bust size'].apply(lambda x: pd.Series(parse_bust(x)))

df['review_year'] = df['review_date'].dt.year
df['review_month'] = df['review_date'].dt.month

**outlier Detection**

In [9]:
# Outlier Detection using Z-score
from scipy import stats

outlier_cols = ['age', 'rating', 'weight_num', 'height_in', 'BMI', 'size']

for col in outlier_cols:
    # Compute z-scores (ignoring NaNs)
    z_scores = np.abs(stats.zscore(df[col].dropna()))
    # Identify outliers as values with a z-score > 3
    outlier_indices = df[col].dropna().index[z_scores > 3]

    if len(outlier_indices) > 0:
        print(f'\nOutliers in {col}:')
        print(df.loc[outlier_indices, [col]])
    else:
        print(f'\nNo significant outliers detected in {col}.')



Outliers in age:
          age
2       116.0
7        65.0
121      69.0
187      59.0
204      59.0
...       ...
192222   62.0
192224   63.0
192293   61.0
192363   62.0
192407   59.0

[2774 rows x 1 columns]

Outliers in rating:
        rating
42         4.0
131        4.0
152        2.0
179        4.0
181        2.0
...        ...
192166     4.0
192246     4.0
192290     2.0
192507     4.0
192535     4.0

[3837 rows x 1 columns]

Outliers in weight_num:
        weight_num
110          250.0
116          231.0
136          210.0
143          215.0
147          220.0
...            ...
192450       220.0
192463       227.0
192511       202.0
192520       245.0
192528       220.0

[3216 rows x 1 columns]

Outliers in height_in:
        height_in
622          56.0
772          74.0
3772         56.0
4161         75.0
6180         56.0
...           ...
189201       74.0
190945       55.0
191008       54.0
191744       75.0
192109       74.0

[288 rows x 1 columns]

Outliers in BMI:
   

**Handling outliers**

In [10]:
numeric_cols = ['age', 'rating', 'weight_num', 'height_in', 'BMI', 'size']

# We compute z-scores using the formula: z = (x - mean) / std.
# Then, we flag rows where any column has an absolute z-score of 3 or higher.
z_scores = np.abs((df[numeric_cols] - df[numeric_cols].mean()) / df[numeric_cols].std(ddof=0))

# If a value is missing, z_score will be NaN. We fill those with 0 so they don't trigger outlier removal.
z_scores_filled = z_scores.fillna(0)

# Create a boolean mask: True for rows where ALL numeric columns have z-score < 3
mask = (z_scores_filled < 3).all(axis=1)

# Create a new DataFrame without outliers
df_cleaned = df[mask].copy()

**Review Word Count Distribution**

In [11]:
# Review Word Count Distribution using Plotly
df_cleaned['review_word_count'] = df_cleaned['review_text'].apply(lambda x: len(str(x).split()))
fig_wc = px.histogram(df_cleaned, x='review_word_count', nbins=10,
                      title='Review Word Count Distribution',
                      labels={'review_word_count': 'Word Count'},
                      marginal="box")
fig_wc.show()

**Generate and Display Word Cloud**

In [12]:
# Generate and Display Word Cloud with Plotly
from wordcloud import WordCloud

# Combine all review texts into one large string
text = " ".join(review for review in df_cleaned['review_text'].dropna())

# Create the word cloud image
wc = WordCloud(width=800, height=400, background_color='white').generate(text)
# Convert the wordcloud image to a numpy array
wc_array = wc.to_array()

# Display the word cloud image using Plotly
fig_wordcloud = px.imshow(wc_array)
fig_wordcloud.update_xaxes(visible=False)
fig_wordcloud.update_yaxes(visible=False)
fig_wordcloud.update_layout(title="Word Cloud of Review Texts")
fig_wordcloud.show()

**Average Rating by Product Category**

In [13]:
#  Average Rating by Product Category
grouped_category = df_cleaned.groupby('category')['rating'].mean().reset_index()

fig1 = px.bar(
    grouped_category,
    x='category',
    y='rating',
    title='Average Rating by Product Category',
    labels={'category': 'Product Category', 'rating': 'Average Rating'},
    color='category',
    text='rating'
)
fig1.update_traces(texttemplate='%{text:.2f}', textposition='outside')
fig1.update_layout(uniformtext_minsize=12, uniformtext_mode='hide')
fig1.show()


**Rental Reasons Count**

In [14]:
#  Rental Reasons Count
rental_counts = df_cleaned['rented for'].value_counts().reset_index()
rental_counts.columns = ['rented for', 'count']

fig3 = px.bar(
    rental_counts,
    x='rented for',
    y='count',
    title='Count of Rental Reasons',
    labels={'rented for': 'Rental Reason', 'count': 'Count'},
    color='rented for',
    text='count'
)
fig3.update_traces(texttemplate='%{text}', textposition='outside')
fig3.show()


**Average BMI by Body Type**

In [15]:
# Average BMI by Body Type
grouped_bmi = df_cleaned.groupby('body type')['BMI'].mean().reset_index()

fig4 = px.bar(
    grouped_bmi,
    x='body type',
    y='BMI',
    title='Average BMI by Body Type',
    labels={'body type': 'Body Type', 'BMI': 'Average BMI'},
    color='body type',
    text='BMI'
)
fig4.update_traces(texttemplate='%{text:.2f}', textposition='outside')
fig4.show()


**Scatter Plot of Age vs Rating**

In [16]:
# Scatter Plot of Age vs Rating
fig5 = px.scatter(
    df_cleaned,
    x='age',
    y='rating',
    size='size',
    hover_data=['user_id', 'rented for'],
    title='Scatter Plot: User Age vs Rating',
    labels={'age': 'User Age', 'rating': 'Rating'}
)
fig5.show()


**top 5 categories by count**

In [17]:
top_categories = df_cleaned['category'].value_counts().head(5).index.tolist()

# Filter the DataFrame to include only the top categories
filtered_df = df_cleaned[df_cleaned['category'].isin(top_categories)]

# Group the filtered data by category and rental reason
sunburst_df_filtered = filtered_df.groupby(['category', 'rented for']).size().reset_index(name='count')

# Create the interactive sunburst chart with Plotly
fig6 = px.sunburst(
    sunburst_df_filtered,
    path=['category', 'rented for'],
    values='count',
    title='Sunburst: Rental Reasons by Top 5 Product Categories',
    color='count',
    color_continuous_scale='RdBu'
)

fig6.show()

**Review Count Over Time**

In [19]:
df['review_word_count'] = df['review_text'].apply(lambda x: len(str(x).split()))
# Group by month-year and count the number of reviews
review_count = df.groupby('review_year').size().reset_index(name='count')

# Create an interactive line plot using Plotly
fig1 = px.line(
    review_count,
    x='review_year',
    y='count',
    title='Review Count Over Time',
    labels={'review_year': 'review_year', 'count': 'Number of Reviews'},
    markers=True
)
fig1.update_layout(xaxis_title='review_year', yaxis_title='Review Count')
fig1.show()


**Average Rating Trend Over Time**

In [20]:
rating_trend = df.groupby('review_year')['rating'].mean().reset_index()

# Create an interactive line plot using Plotly
fig2 = px.line(
    rating_trend,
    x='review_year',
    y='rating',
    title='Average Rating Trend Over Time',
    labels={'review_year': 'review_year', 'rating': 'Average Rating'},
    markers=True
)
fig2.update_layout(xaxis_title='Month', yaxis_title='Average Rating')
fig2.show()


**Correlation Heatmap**

In [21]:
corr = df_cleaned[numeric_cols].corr()

fig_corr = px.imshow(corr,
                     text_auto=True,
                     color_continuous_scale='RdBu_r',
                     title='Correlation Heatmap',
                     labels=dict(color="Correlation"))
fig_corr.show()

**Feature Selection**

In [60]:
df = df_cleaned.copy()

# Drop irrelevant columns
drop_cols = ['user_id', 'item_id', 'review_text', 'review_summary', 'review_date','height','weight','review_word_count','review_year','review_month']
df.drop(columns=drop_cols, inplace=True)

# Encode categorical variables
label_encoders = {}
categorical_cols = ['bust size', 'body type', 'cup_size', 'rented for','category']

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# Encode target variable
target_encoder = LabelEncoder()
df['fit'] = target_encoder.fit_transform(df['fit'])

**Handling Class Imbalance**

In [71]:
# Define features and target
X = df.drop(columns=['fit'])
y = df['fit']

# Handle Class Imbalance with SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

**Data Scaling**

In [62]:
# Standardize numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

**Model Development**

In [63]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=200, max_depth=20, min_samples_split=5, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

**Model Evaluation**

In [64]:
# Predictions
y_pred = rf.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.7806362070461108
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.83      0.79     26899
           1       0.82      0.73      0.77     26899
           2       0.78      0.78      0.78     26899

    accuracy                           0.78     80697
   macro avg       0.78      0.78      0.78     80697
weighted avg       0.78      0.78      0.78     80697

Confusion Matrix:
 [[22394  1932  2573]
 [ 3815 19745  3339]
 [ 3524  2519 20856]]


**Saving Objects**

In [82]:
import joblib

# Save preprocessing objects
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(label_encoders, 'label_encoders.pkl')
joblib.dump(target_encoder, 'target_encoder.pkl')

print("Scaler, Label Encoders, and Target Encoder saved successfully!")

Scaler, Label Encoders, and Target Encoder saved successfully!


**Save Model**

In [65]:
import joblib

# Save the trained model
joblib.dump(rf, 'random_forest_model.pkl')
print("Model saved successfully!")

Model saved successfully!


**Loading Model**

In [66]:
# Load the model
rf_loaded = joblib.load('/content/random_forest_model.pkl')
print("Model loaded successfully!")

Model loaded successfully!


**Model Evaluation on Test Dataset**

In [76]:
import pandas as pd
import numpy as np

user_input = {
    'bust size': '34b',  # Categorical
    'rating': 9.0,  # Numerical
    'rented for': 'wedding',  # Categorical
    'body type': 'athletic',  # Categorical
    'category': 'dress',  # Categorical
    'size': 6,  # Numerical
    'age': 30,  # Numerical
    'weight_num': 140,  # Numerical
    'height_in': 65,  # Numerical
    'BMI': 22.0,  # Numerical
    'bust_number': 34,  # Numerical
    'cup_size': 'b'  # Categorical
}

# Convert categorical values using label encoders
for col in ['bust size', 'rented for', 'body type', 'category', 'cup_size']:
    user_input[col] = label_encoders[col].transform([user_input[col]])[0]

# Convert to DataFrame
user_df = pd.DataFrame([user_input])

# Scale numerical values using the same scaler as training
user_scaled = scaler.transform(user_df)

# Predict Fit
predicted_fit = rf_loaded.predict(user_scaled)

# Decode prediction to original fit labels
predicted_fit_label = target_encoder.inverse_transform(predicted_fit)

print("Predicted Fit:", predicted_fit_label[0])

Predicted Fit: large
