<a href="https://colab.research.google.com/github/ohabardi/fashion-review-pipeline/blob/main/Data_Science_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import io

import pandas as pd
import numpy as np
from google.colab import files

# Show every column when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Ask the user for the reviews CSV. Colab renames an upload when a file with
# the same name already exists (e.g. "reviews (1).csv"), so reading a
# hard-coded "reviews.csv" from disk can silently load a stale copy.
# Parse the bytes that were just uploaded instead.
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded; please upload the reviews CSV.")

# `uploaded` maps file names to raw bytes; the first uploaded file is used.
uploaded_bytes = next(iter(uploaded.values()))
df = pd.read_csv(io.BytesIO(uploaded_bytes))

df.head()

Saving reviews.csv to reviews (1).csv


Unnamed: 0,Clothing ID,Age,Title,Review Text,Positive Feedback Count,Division Name,Department Name,Class Name,Recommended IND
0,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,0,General,Dresses,Dresses,0
1,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",0,General Petite,Bottoms,Pants,1
2,847,47,Flattering shirt,This shirt is very flattering to all due to th...,6,General,Tops,Blouses,1
3,1080,49,Not for the very petite,"I love tracy reese dresses, but this one is no...",4,General,Dresses,Dresses,0
4,858,39,Cagrcoal shimmer fun,I aded this in my basket at hte last mintue to...,1,General Petite,Tops,Knits,1


In [None]:
# Column names, dtypes and non-null counts
df.info()

# Number of missing entries in each column
missing_counts = df.isnull().sum()
print("\nMissing values:")
print(missing_counts)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18442 entries, 0 to 18441
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Clothing ID              18442 non-null  int64 
 1   Age                      18442 non-null  int64 
 2   Title                    18442 non-null  object
 3   Review Text              18442 non-null  object
 4   Positive Feedback Count  18442 non-null  int64 
 5   Division Name            18442 non-null  object
 6   Department Name          18442 non-null  object
 7   Class Name               18442 non-null  object
 8   Recommended IND          18442 non-null  int64 
dtypes: int64(4), object(5)
memory usage: 1.3+ MB

Missing values:
Clothing ID                0
Age                        0
Title                      0
Review Text                0
Positive Feedback Count    0
Division Name              0
Department Name            0
Class Name                 0
Recommende

In [None]:
# Review length = number of whitespace-separated tokens in the review
def _word_count(text):
    """Return how many whitespace-separated tokens ``str(text)`` contains."""
    return len(str(text).split())


df['review_length'] = df['Review Text'].map(_word_count)

# Show the text next to its computed length
df[['Review Text', 'review_length']].head(5)


Unnamed: 0,Review Text,review_length
0,I had such high hopes for this dress and reall...,98
1,"I love, love, love this jumpsuit. it's fun, fl...",22
2,This shirt is very flattering to all due to th...,36
3,"I love tracy reese dresses, but this one is no...",98
4,I aded this in my basket at hte last mintue to...,101


In [None]:
!pip install textblob




In [None]:
from textblob import TextBlob

# Sentiment feature: TextBlob polarity of each review
def _polarity(text):
    """Return the TextBlob sentiment polarity of ``str(text)``."""
    return TextBlob(str(text)).sentiment.polarity


df['sentiment_score'] = df['Review Text'].map(_polarity)
df[['Review Text', 'sentiment_score']].head(5)


Unnamed: 0,Review Text,sentiment_score
0,I had such high hopes for this dress and reall...,0.073675
1,"I love, love, love this jumpsuit. it's fun, fl...",0.55
2,This shirt is very flattering to all due to th...,0.512891
3,"I love tracy reese dresses, but this one is no...",0.17875
4,I aded this in my basket at hte last mintue to...,0.13375


In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m64.7 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy

# Shared spaCy pipeline (small English model) used by the feature cells below
nlp = spacy.load(name='en_core_web_sm')

In [None]:
# Pipeline components that are not needed for coarse part-of-speech tags.
# The parser is deliberately left enabled: spaCy's English attribute ruler may
# use the dependency parse when mapping fine-grained tags to `pos_`, so
# disabling it could change the counts.
_PIPES_NOT_NEEDED_FOR_POS = ['ner', 'lemmatizer']


def _count_adj_tokens(doc):
    """Return how many tokens in a parsed spaCy ``doc`` have ``pos_ == 'ADJ'``."""
    return sum(1 for token in doc if token.pos_ == 'ADJ')


def count_adjectives(text):
    """Count the adjectives in a single piece of text.

    Parameters
    ----------
    text : object
        Text to analyse; it is converted with ``str`` before being parsed.

    Returns
    -------
    int
        Number of tokens whose coarse part-of-speech tag is ``'ADJ'``.
    """
    return _count_adj_tokens(nlp(str(text)))


# Build the feature column. Streaming the reviews through `nlp.pipe` processes
# them in batches, which is much faster than one `nlp(...)` call per row.
_review_docs = nlp.pipe(
    (str(text) for text in df['Review Text']),
    disable=_PIPES_NOT_NEEDED_FOR_POS,
    batch_size=256,
)
df['num_adjectives'] = [_count_adj_tokens(doc) for doc in _review_docs]


In [None]:
# Preview the review text next to its adjective count
df.loc[:, ['Review Text', 'num_adjectives']].head(5)


Unnamed: 0,Review Text,num_adjectives
0,I had such high hopes for this dress and reall...,17
1,"I love, love, love this jumpsuit. it's fun, fl...",2
2,This shirt is very flattering to all due to th...,6
3,"I love tracy reese dresses, but this one is no...",6
4,I aded this in my basket at hte last mintue to...,4


In [None]:
# Columns that are one-hot encoded by the preprocessing step
categorical_features = [
    'Division Name',
    'Department Name',
    'Class Name',
]


In [None]:
# Columns that are standard-scaled by the preprocessing step
# (two raw columns followed by the three engineered features)
numeric_features = [
    'Age',
    'Positive Feedback Count',
    'review_length',
    'sentiment_score',
    'num_adjectives',
]


In [None]:
# This cell used to repeat the `numeric_features` definition from the previous
# cell verbatim. Instead of redefining the list, verify that every selected
# numeric and categorical column really exists in the DataFrame so a typo or a
# skipped feature-engineering cell fails here rather than inside the pipeline.
_missing_columns = [
    col for col in numeric_features + categorical_features if col not in df.columns
]
assert not _missing_columns, f"Missing expected feature columns: {_missing_columns}"


In [None]:
# Single free-text column fed to the TF-IDF vectorizer
text_feature: str = 'Review Text'


In [None]:
# Name of the label column the classifier predicts
target: str = 'Recommended IND'


In [None]:
# Remove columns that are not used as model inputs.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell
# idempotent: re-running it no longer raises KeyError once the columns are gone.
df = df.drop(columns=['Clothing ID', 'Title'], errors='ignore')


In [None]:
from sklearn.model_selection import train_test_split

# Label vector and predictor frame (everything except the label column)
y = df[target]
X = df.drop(columns=[target])

# Hold out 20% of the rows for testing, stratified on the label, fixed seed
_split = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = _split


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier


In [None]:
# Numeric branch: standardize each numeric column (zero mean, unit variance),
# which helps models that are sensitive to feature scale.
_numeric_steps = [('scaler', StandardScaler())]
numeric_transformer = Pipeline(steps=_numeric_steps)


In [None]:
# Categorical branch: expand each category column into 0/1 indicator columns.
# handle_unknown='ignore' lets categories unseen during fitting pass through
# at prediction time without raising.
_categorical_steps = [('onehot', OneHotEncoder(handle_unknown='ignore'))]
categorical_transformer = Pipeline(steps=_categorical_steps)


In [None]:
# Text branch: TF-IDF representation of the review text.
#   - vocabulary capped at 1000 features
#   - English stop words (e.g. "the", "and") removed
_tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
text_transformer = Pipeline(steps=[('tfidf', _tfidf)])


In [None]:
# One ColumnTransformer routes each group of columns to its own branch:
#   'num' -> scaling of the numeric columns
#   'cat' -> one-hot encoding of the categorical columns
#   'txt' -> TF-IDF of the review text (a single column name, not a list)
_branches = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
    ('txt', text_transformer, text_feature),
]
preprocessor = ColumnTransformer(transformers=_branches)


In [None]:
# End-to-end model: the ColumnTransformer preprocessing followed by a
# Random Forest classifier with a fixed seed.
_forest = RandomForestClassifier(random_state=42)
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', _forest),
    ]
)


In [None]:
# Fit preprocessing and classifier together on the training split
pipeline.fit(X=X_train, y=y_train)


In [None]:
# Predicted labels for the held-out test split
y_pred = pipeline.predict(X=X_test)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Per-class precision / recall / F1 of the baseline model on the test split
baseline_report = classification_report(y_test, y_pred)
print(baseline_report)

# Confusion matrix of true vs. predicted labels
baseline_confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(baseline_confusion)


              precision    recall  f1-score   support

           0       0.81      0.35      0.49       678
           1       0.87      0.98      0.92      3011

    accuracy                           0.87      3689
   macro avg       0.84      0.67      0.71      3689
weighted avg       0.86      0.87      0.84      3689

Confusion Matrix:
[[ 240  438]
 [  58 2953]]


In [None]:
from sklearn.model_selection import RandomizedSearchCV


In [None]:
# Search space for the Random Forest step; the 'classifier__' prefix routes
# each setting to the pipeline step named 'classifier'.
param_grid = dict(
    classifier__n_estimators=[100, 200, 300],
    classifier__max_depth=[None, 10, 20, 30],
    classifier__min_samples_split=[2, 5, 10],
    classifier__class_weight=[None, 'balanced'],
)


In [None]:
# Randomized hyper-parameter search over the full pipeline.
#
# Scoring note: plain 'f1' measures only the positive class (label 1), which
# is the large majority here, so it barely rewards better detection of the
# "not recommended" class and the search simply reproduces the baseline.
# 'f1_macro' averages the F1 of both classes equally, matching the stated goal
# of optimizing a balanced metric.
search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_grid,
    n_iter=10,                # Number of parameter combinations to sample
    cv=3,                     # 3-fold cross-validation
    scoring='f1_macro',       # Macro-averaged F1: both classes weigh equally
    verbose=1,
    n_jobs=-1,                # Use all available CPU cores
    random_state=42
)


In [None]:
# Run the cross-validated search on the training split
search.fit(X=X_train, y=y_train)


Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [None]:
# Winning parameter combination and its mean cross-validated score
best_params = search.best_params_
best_score = search.best_score_
print("Best params:", best_params)
print("Best F1 score:", best_score)

Best params: {'classifier__n_estimators': 100, 'classifier__min_samples_split': 2, 'classifier__max_depth': None, 'classifier__class_weight': None}
Best F1 score: 0.9214911321332533


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Test-set predictions from the best estimator found by the search
y_pred_tuned = search.predict(X=X_test)

# Same evaluation as for the baseline model
tuned_report = classification_report(y_test, y_pred_tuned)
tuned_confusion = confusion_matrix(y_test, y_pred_tuned)
print(tuned_report)
print("Confusion Matrix:")
print(tuned_confusion)


              precision    recall  f1-score   support

           0       0.81      0.35      0.49       678
           1       0.87      0.98      0.92      3011

    accuracy                           0.87      3689
   macro avg       0.84      0.67      0.71      3689
weighted avg       0.86      0.87      0.84      3689

Confusion Matrix:
[[ 240  438]
 [  58 2953]]
