In [2]:
# Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer


In [5]:

# Load data
df = pd.read_csv('Womens Clothing E-Commerce Reviews.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [8]:
# Check for missing values in the 'Review Text' column
missing_values = df['Review Text'].isnull().sum()
if missing_values > 0:
    df = df.dropna(subset=['Review Text'])  # Drop rows with missing values in 'Review Text'
    print(f"Dropped {missing_values} rows with missing 'Review Text' values.")

Dropped 845 rows with missing 'Review Text' values.


In [9]:

# Define target variables
X = df['Review Text']
y = df['Recommended IND']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [10]:

# Use TfidfVectorizer to convert text data into numerical features
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [11]:

# Initialize and train a logistic regression model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)


In [12]:

# Make predictions on the test set
predictions = model.predict(X_test_tfidf)


In [13]:

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)
class_report = classification_report(y_test, predictions)


In [14]:

print(f'Accuracy: {accuracy:.2f}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{class_report}')


Accuracy: 0.89
Confusion Matrix:
[[ 428  384]
 [ 132 3585]]
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.53      0.62       812
           1       0.90      0.96      0.93      3717

    accuracy                           0.89      4529
   macro avg       0.83      0.75      0.78      4529
weighted avg       0.88      0.89      0.88      4529

