## Import libraries

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix,recall_score,precision_score,f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics.pairwise import pairwise_distances_argmin_min

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Data Loading

In [5]:
df = pd.read_csv("https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/final/X_train.csv")
labels = pd.read_csv("https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/final/y_train.csv")

In [6]:
df.head()

Unnamed: 0,review
0,"Shame, is a Swedish film in Swedish with Engli..."
1,I know it's rather unfair to comment on a movi...
2,"""Bread"" very sharply skewers the conventions o..."
3,After reading tons of good reviews about this ...
4,During the Civil war a wounded union soldier h...


In [7]:
df.shape

(40000, 1)

In [8]:
df.describe()

Unnamed: 0,review
count,40000
unique,39719
top,Loved today's show!!! It was a variety and not...
freq,5


In [9]:
df.isnull().sum()

review    0
dtype: int64

In [10]:
labels.head()

Unnamed: 0,sentiment
0,1
1,0
2,1
3,1
4,1


In [11]:
labels.value_counts()

sentiment
0            20000
1            20000
dtype: int64

In [12]:
len(df)

40000

In [13]:
df.duplicated().sum()

281

In [14]:
df.drop_duplicates(keep='last',inplace=True)

## Data cleaning

In [14]:
def convert_to_lower(text):
    return text.lower()
def remove_numbers(text):
    number_pattern = r'\d+'
    without_number = re.sub(pattern=number_pattern, repl=" ", string=text)
    return without_number

import string
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_stopwords(text):
    removed = []
    stop_words = list(stopwords.words("english"))
    tokens = nltk.word_tokenize(text)
    for i in range(len(tokens)):
        if tokens[i] not in stop_words:
            removed.append(tokens[i])
    return " ".join(removed)
def remove_extra_white_spaces(text):
    single_char_pattern = r'\s+[a-zA-Z]\s+'
    without_sc = re.sub(pattern=single_char_pattern, repl=" ", string=text)
    return without_sc
def lemmatizing(text):
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    for i in range(len(tokens)):
        lemma_word = lemmatizer.lemmatize(tokens[i])
        tokens[i] = lemma_word
    return " ".join(tokens)

df['review'] = df['review'].apply(convert_to_lower)
df['review'] = df['review'].apply(remove_numbers)
df['review'] = df['review'].apply(remove_punctuation)
df['review'] = df['review'].apply(remove_stopwords)
df['review'] = df['review'].apply(remove_extra_white_spaces)
df['review'] = df['review'].apply(lemmatizing)


- Replaced the missing values with empty string
- Converted the text of reviews body to lower case
- Removed digits from the body
- Removed punctuations
- Removed stopwords
- Removed extra white spaces
- Lemmatized the text to convert all words to their root word

#### Performed sanity check to see if text is processed properly

In [15]:
random_samples = df['review'].sample(5).tolist()
for text in random_samples:
    print("Preprocessed Text:")
    print(text)

Preprocessed Text:
poor basketball movie gruff coach dubious background come small indiana high school basketball team coach boy victory breaking first building upbr br bad subject photography ok plot totally predictable real subplots nothing added make movie exciting know going happen beginning suitable th grader
Preprocessed Text:
throughout watching end day got sense film maker perhaps trying make unique average hollywood action film failed course give credit trying peter hyams actually tried directing time instead churning another flat action film attempted inject atmosphere movie darkening light adding ton blood method work used correctly see se en feel like cheap trick try scare u hyams decent action director offer nothing basic shoot out fight scene except lackluster sub par fx end battle photographer hyams demonstrates actual ability displaying good frame work movement nothing solid workbr br screenwriter andrew marlowe film greatest enemy part script actually show making good 

### Train-test-split

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df, labels['sentiment'], test_size =  0.2)

In [28]:
X_train.shape

(32000, 1)

In [None]:
y_train.shape

(32000,)

### Finding optimal number of clusters

In [None]:
wcss = []
X = tfidf.fit(X['review']).transform(df['review'])
for num_clusters in range(2, 100):
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
    
plt.plot(range(2, 50), wcss, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.title('Elbow Method')
plt.show()

### KMeans clustering for semisupervised Learning

- kmeans clustering is performed on the sample data and its labels are used 
- We want to assign labels to the unlabeled data using the information from the labeled data.
- we perform a clustering algorithm called K-means on the transformed labeled data. K-means groups similar examples into clusters based on their features.
- Once the clustering is done, we identify representative examples from each cluster. These representative examples are the ones that are closest to the center of each cluster. We do this to capture the main characteristics of each cluster.
- Using the trained K-means model, we predict which cluster each unlabeled example belongs to. This gives us cluster labels for the unlabeled data.
- Now comes the labeling part. We assign labels to the unlabeled examples based on the labels of the representative examples within the same clusters

In [18]:
from sklearn.cluster import KMeans
import numpy as np
k=100 ## chosen 100 clusters
n_accessible = 5000 
X_accessible = X_train[:n_accessible]
y_accessible = y_train[:n_accessible]
p = Pipeline([('tfidf', TfidfVectorizer(stop_words='english',
                                   ngram_range=(1, 2),lowercase=True)),
        ("kmeans", KMeans(n_clusters=k)),
    ])
# kmeans = KMeans(n_clusters=k)
X_dist = p.fit_transform(X_accessible.values.reshape(-1)) 
# representative_idx = pairwise_distances_argmin_min(X_accessible, kmeans.cluster_centers_)[0]
representative_idx = np.argmin(X_dist, axis=0) 
X_representative = X_accessible.values[representative_idx]



#### print top words for top clusters

In [19]:
cluster_centers = p.named_steps['kmeans'].cluster_centers_
feature_names = p.named_steps['tfidf'].get_feature_names_out()

In [20]:
top_words_per_cluster = []
for cluster_center in cluster_centers[:5]:
    top_word_indices = cluster_center.argsort()[:-6:-1]  # Retrieve the indices of the top 5 features
    top_words = [feature_names[index] for index in top_word_indices]
    top_words_per_cluster.append(top_words)

In [21]:
for cluster_index, top_words in enumerate(top_words_per_cluster):
    print(f"Cluster {cluster_index + 1} Top Words:", top_words)

Cluster 1 Top Words: ['martial', 'martial art', 'movie', 'art', 'steven']
Cluster 2 Top Words: ['novel', 'jane', 'rochester', 'br', 'film']
Cluster 3 Top Words: ['film', 'town', 'freddy', 'yellow', 'goya']
Cluster 4 Top Words: ['worst', 'ive', 'movie', 'ive seen', 'seen']
Cluster 5 Top Words: ['bad', 'movie', 'acting', 'br', 'special effect']


#### Get cluster representatives

In [22]:
representative_idx   # Get the cluster representatives of the data

array([2258,  174,  259, 3469, 4739, 3326, 2261, 3610, 3745, 2804, 2039,
       3451,  438, 2725, 1624, 1332,  108,  190, 3897, 1032, 4168, 3575,
       4866, 3417, 2138, 2819, 3050, 2152, 1984, 4214, 2303, 3668, 4719,
       3347, 1972,  352,  968, 1920, 3189, 1074, 3748, 3631, 1757, 3672,
        504, 2427,  846, 4205, 4309, 3428,  295,  411, 4317, 1599,   85,
        318,  618,  319, 4531, 2381,   32,  300, 2542, 4565, 2740,  754,
       1882, 4207, 4203, 4170,  830, 4730, 4543, 3535, 2555, 1279, 4867,
       2665, 1848,  921, 1742, 2222, 2935, 1526, 2425, 4837, 2670, 1647,
       3853, 2016,  970,  614,  408, 3937,  343,  788, 4747, 3486, 3214,
       3636])

In [23]:
representative_labels = y_accessible.iloc[representative_idx]

In [24]:
representative_labels

8070     0
21166    0
1630     0
19824    0
4219     1
        ..
29527    1
26289    1
13852    1
28016    1
24881    0
Name: sentiment, Length: 100, dtype: int64

In [25]:
X_unlabeled = X_train[n_accessible:]

In [26]:
# Predict the cluster assignments for the unlabeled data
unlabeled_cluster_labels = p.predict(X_unlabeled.values.reshape(-1))

In [27]:
unlabeled_labels = y_accessible.iloc[unlabeled_cluster_labels]

In [28]:
X_combined = pd.concat([X_accessible, X_unlabeled])
y_combined = pd.concat([y_accessible, unlabeled_labels])

X_combined

Unnamed: 0,review
17788,finally uncut version baby face surface source...
23660,would given otherwise terrific series full vot...
12892,ok year ago awesome funny movie mask came ever...
31262,looking crazy stunt typified harold lloyd sile...
29740,caught movie tube sunday thought bad looked im...
...,...
10678,every review read far seems missed crucial poi...
21000,non existent plot ton poorly directed superche...
18898,outragously entertaining period piece set spin...
15651,storyline thief bagdad complex owing told flas...


### Vectorization(TF-IDF):

 - Vectorization is performed here because the body of the review is the text feature
 - We have chosen TF-IDF vectorizer as it removes the most frequently occuring words(max_df ) and less frequent words (min_df) which is required to build accurate model and in order for it not to get biased towards those words 

In [None]:
# Preprocess the text data and convert it to numerical representation
vectorizer =  TfidfVectorizer(stop_words='english',max_df = 0.99,min_df=10, ngram_range=(1, 2),lowercase=True)
X_processed = vectorizer.fit_transform(X_combined['review'])

# Train a classification model
model = LogisticRegression()
model.fit(X_processed, y_combined)


### Sanity check on vectorizer

In [31]:
vocab_size = len(vectorizer.get_feature_names_out())
print("Vocabulary Size:", vocab_size)
  

Vocabulary Size: 2108101


In [33]:
print("Processed Data Shape:", X_processed.shape)

Processed Data Shape: (32000, 2108101)


In [None]:
def classifier_scores(y_train,y_test, pred_train, pred_test):    
    print()
    print("Train data accuracy score: ", accuracy_score(y_train,pred_train))    
    print("Test data accuracy score: ", accuracy_score(y_test,pred_test))
    print()
    print("Recall score on train data: ", recall_score(y_train,pred_train, average='macro'))
    print("Recall score on test data: ",recall_score( y_test,pred_test, average='macro'))
    print()
    
    print("Precision score on train data: ",precision_score(y_train,pred_train, average='macro'))
    print("Precision score on test data: ",precision_score(y_test,pred_test, average='macro'))
    print()
    print("F1 score on train data: ",f1_score(y_train,pred_train, average='macro'))
    print("F1 score on test data: ",f1_score(y_test,pred_test, average='macro'))

In [None]:
y_train_pred = model.predict(X_processed)


In [None]:
X_test_processed = vectorizer.transform(X_test['review'])

# Use the model for predictions on test data
y_test_pred = model.predict(X_test_processed)

In [None]:
classifier_scores(y_combined,y_test,y_train_pred,y_test_pred)


Train data accuracy score:  0.88584375
Test data accuracy score:  0.567375

Recall score on train data:  0.861851558057976
Recall score on test data:  0.5701169929306145

Precision score on train data:  0.8987045454545455
Precision score on test data:  0.5826596727562708

F1 score on train data:  0.8743272719116182
F1 score on test data:  0.5515459089407878


#### Test LR Model on Production data set

In [None]:
X_final = pd.read_csv(r"https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/final/X_final.csv")

In [None]:
y_final = pd.read_csv(r"https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/final/y_final.csv")

In [None]:
X_final_processed = vectorizer.transform(X_final['review'])

# Use the model for predictions on test data
y_final_pred = model.predict(X_final_processed)

In [None]:
classifier_scores(y_combined,y_final,y_train_pred,y_final_pred)


Train data accuracy score:  0.88584375
Test data accuracy score:  0.5432

Recall score on train data:  0.861851558057976
Recall score on test data:  0.5432

Precision score on train data:  0.8987045454545455
Precision score on test data:  0.5433776749566224

F1 score on train data:  0.8743272719116182
F1 score on test data:  0.5427317573194952


-  accuracy or F1 score will be good metric or good measure for the fit of the model. we have focused on just improving the Accuracy and F1 Score.

#### Pipeline for Logistic regression model

In [None]:
log_reg = LogisticRegression()
pipeline = Pipeline([
    ('vecto', vectorizer),
    ('log_reg', log_reg)
])
pipeline.fit(X_combined.values.reshape(-1), y_combined.values.reshape(-1))

In [None]:
y_train_pred1 = pipeline.predict(X_combined['review'])
y_final_pred1 = pipeline.predict(X_final['review'])
classifier_scores(y_combined,y_final,y_train_pred1,y_final_pred1)


Train data accuracy score:  0.88584375
Test data accuracy score:  0.5432

Recall score on train data:  0.861851558057976
Recall score on test data:  0.5432

Precision score on train data:  0.8987045454545455
Precision score on test data:  0.5433776749566224

F1 score on train data:  0.8743272719116182
F1 score on test data:  0.5427317573194952


#### Logistic Regression  with hypertuning

In [None]:
param_grid = {
     'log_reg__C': [0.1, 1.0, 10.0],  # Example values for the 'C' hyperparameter
     'log_reg__solver': ['liblinear', 'lbfgs']
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5)
grid_search.fit(X_combined.values.reshape(-1), y_combined.values.reshape(-1))

best_params = grid_search.best_params_
best_score = grid_search.best_score_


In [None]:
gs_score = grid_search.score(X_final['review'],y_final.values.reshape(-1))
# print(score)

In [None]:
y_train_pred_gs = grid_search.predict(X_combined['review'])

In [None]:
y_final_pred_gs = grid_search.predict(X_final['review'])
classifier_scores(y_combined,y_final,y_train_pred1,y_final_pred1)


Train data accuracy score:  0.88584375
Test data accuracy score:  0.5432

Recall score on train data:  0.861851558057976
Recall score on test data:  0.5432

Precision score on train data:  0.8987045454545455
Precision score on test data:  0.5433776749566224

F1 score on train data:  0.8743272719116182
F1 score on test data:  0.5427317573194952


In [None]:
# loading library
import pickle
# create an iterator object with write permission - model.pkl
with open('lr_gridsearch_model_pkl', 'wb') as files:
    pickle.dump(grid_search, files)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
rand_f = RandomForestClassifier(random_state=0)
rf_pipeline = Pipeline([
    ('vecto', vectorizer),
    ('random_forest', rand_f)
])
rf_pipeline.fit(X_combined.values.reshape(-1), y_combined.values.reshape(-1))

In [None]:
y_train_pred_rf = rf_pipeline.predict(X_combined['review'])
y_final_pred_rf = rf_pipeline.predict(X_final['review'])
classifier_scores(y_combined,y_final,y_train_pred_rf,y_final_pred_rf)


Train data accuracy score:  0.9991875
Test data accuracy score:  0.5761

Recall score on train data:  0.9990506754347214
Recall score on test data:  0.5761000000000001

Precision score on train data:  0.9992332123043298
Precision score on test data:  0.6251905083682205

F1 score on train data:  0.9991417787701988
F1 score on test data:  0.5300278760323297


In [None]:
# loading library
import pickle
# create an iterator object with write permission - model.pkl
with open('rf_pipeline_model_pkl', 'wb') as files:
    pickle.dump(rf_pipeline, files)

In [None]:
with open('rf_pipeline_model_pkl' , 'rb') as f:
    rf_pipe = pickle.load(f)

In [None]:
y_train_pred_rf = rf_pipe.predict(X_combined['review'])
y_final_pred_rf = rf_pipe.predict(X_final['review'])
classifier_scores(y_combined,y_final,y_train_pred_rf,y_final_pred_rf)


Train data accuracy score:  0.9991875
Test data accuracy score:  0.5761

Recall score on train data:  0.9990506754347214
Recall score on test data:  0.5761000000000001

Precision score on train data:  0.9992332123043298
Precision score on test data:  0.6251905083682205

F1 score on train data:  0.9991417787701988
F1 score on test data:  0.5300278760323297


### Random Forest with hypertuning

In [None]:
param_grid = {
    'random_forest__n_estimators': [100, 200, 300],  # Example values for the number of estimators
    'random_forest__max_depth': [None, 5, 10],  # Example values for the maximum depth
}
rand_f = RandomForestClassifier(random_state=0)
rf_pipeline = Pipeline([
    ('vecto', vectorizer),
    ('random_forest', rand_f)
])
grid_search_rf = GridSearchCV(rf_pipeline, param_grid, cv=5)
grid_search_rf.fit(X_combined.values.reshape(-1), y_combined.values.reshape(-1))

best_params = grid_search_rf.best_params_
best_score = grid_search_rf.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

In [None]:
y_train_pred_rf_gs = grid_search_rf.predict(X_combined['review'])
y_final_pred_rf = rf_pipeline.predict(X_final['review'])
classifier_scores(y_combined,y_final,y_train_pred_rf,y_final_pred_rf)

### Conclusion

- Performed Semi supervised Text Clustering which includes
    - utilized partial data to consider it as semi supervised model  and had fit into kmeans cluster model
    - Predicted the labels of unlabelled data 
- Fitted the different models on full data 
- Achieved best accuracy of 57% with production data
-  Random Forest with hypertuning approach performed the best of the all models With Accuracy of 0.57. Logistic regression performed good when considering F1 Score with f1 score of 0.54. But Random Forest is computationally bit expensive compared to Logistic regression Hence we would consider the Logistic Regression hypertuned model as the final model for the Review Classification.