In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

The goal of this project is to construct a machine learning model for predicting the sentiment of movie reviews. This involves addressing a classification challenge, leading to the exploration of various classification algorithms in order to identify the optimal model.

**1) LOAD THE DATA AND STORE THEM INTO APPROPRIATE VARIABLES**

In [None]:
# Load the competition CSV files from the read-only Kaggle input directory.
# train:  labelled reviews (contains the 'sentiment' target inspected below).
# test:   reviews to predict for the submission.
# movies: per-movie metadata, later joined to the reviews on 'movieid'.
# sample: presumably the sample submission file; it is not referenced again in the visible cells.
train = pd.read_csv('/kaggle/input/sentiment-prediction-on-movie-reviews/train.csv')
test = pd.read_csv('/kaggle/input/sentiment-prediction-on-movie-reviews/test.csv')
movies = pd.read_csv('/kaggle/input/sentiment-prediction-on-movie-reviews/movies.csv')
sample = pd.read_csv('/kaggle/input/sentiment-prediction-on-movie-reviews/sample.csv')

**2) EXPLORATORY DATA ANALYSIS**

**2.1) Visualizing key statistics and relationships in the data**

**2.1.1) TRAIN DATASET**

In [None]:
train.head()

In [None]:
train.info()

In [None]:
train.describe()

In [None]:
print(type(train))
print(train['sentiment'].unique())
#Here we can see that it is a binary classification problem since there are two unique values for the
#label i.e 'sentiment' to be predicted.

**visualise the distribution of Labels in train data**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Bar chart of how many training reviews fall into each sentiment class.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='sentiment', data=train, ax=ax)
ax.set_xlabel('sentiment')
ax.set_ylabel('Count')
ax.set_title('Distribution of Label Classes')
plt.show()

We can see that the training data is imbalanced: the positive class has more samples than the other class.

**2.1.2) TEST DATA SET**

In [None]:
test.head()

In [None]:
test.describe()

In [None]:
test.info() #To understand the feature type

In [None]:
print(type(test))

**2.1.3) MOVIES DATA SET**

In [None]:
movies.head()

In [None]:
movies.describe()

In [None]:
movies.info() #To understand the feature type

In [None]:
type(movies)

Dropping few columns to maintain only the following columns: reviewText,movieid and sentiment


In [None]:
# Reviewer metadata is not used by the text-only models, so leave it out of the working frame.
train_new = train.drop(columns=['reviewerName', 'isFrequentReviewer'])

In [None]:
# Mirror the training frame: drop the reviewer metadata columns from the test set as well.
test_new = test.drop(columns=['reviewerName', 'isTopCritic'])

In [None]:
# Drop the movie metadata columns that are not used at this stage.
unused_movie_columns = [
    'title',
    'rating',
    'ratingContents',
    'releaseDateTheaters',
    'releaseDateStreaming',
    'runtimeMinutes',
    'genre',
    'originalLanguage',
    'director',
    'boxOffice',
    'distributor',
    'soundType',
]
movies_new = movies.drop(columns=unused_movie_columns)

**1) DUMMY CLASSIFIER**

Demonstrating the result of a dummy classifier on the data. It gave an accuracy of 0.67 on the evaluation set. This can be used as a benchmark for comparing the ML models.

In [None]:
#X = train_new['reviewText']
#y = train_new['sentiment']

In [None]:
#from sklearn.model_selection import train_test_split
#X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state =42)
#from sklearn.dummy import DummyClassifier
#dummy_clf = DummyClassifier(strategy="prior")
#dummy_clf.fit(X_train,y_train)


In [None]:
#from sklearn.metrics import accuracy_score
#accuracy = accuracy_score(y_test,y_pred)
#accuracy


In [None]:
#test = pd.read_csv('/kaggle/input/sentiment-prediction-on-movie-reviews/test.csv')

#test_df = test.drop(['movieid','reviewerName','isTopCritic'],axis =1)
#test_df = test_df.fillna('')
#test_predictions = dummy_clf.predict(test_df)

In [None]:
#submission = pd.DataFrame(columns= ['id','sentiment'])
#submission['id'] = [i for i in range(len(test_predictions))]
#submission['sentiment'] = test_predictions

In [None]:
#submission.shape

In [None]:
#submission.info

In [None]:
# save the submission dataframe into .csv file
#submission.to_csv('submission.csv',index = False)

**3) CHECKING FOR MISSING VALUE AND IMPUTATION**

**3.1) Checking for missing values in the train data**

In [None]:
print(train_new.shape)
train_missing_values = train_new.isna().sum() # this shows the total no of missing values for each feature in the trainig dataset.
print(train_missing_values)

The output above shows that there are 6447 missing entries in the reviewText column of the training dataset. The following code removes the rows with missing reviewText values.

In [None]:
# NOTE: dropna() without arguments removes a row if ANY column is missing, not only reviewText.
train_dataset = train_new.dropna() #drop rows with missing values
print(train_dataset.isna().sum()) #Sanity check: every per-column missing count should now be 0.
print(train_dataset.shape) #row count after the rows with missing values were removed

**3.2) Checking for missing values in the test data and imputation**

In [None]:
print(test_new.isna().sum())
# Build a new frame rather than aliasing test_new: with `test_dataset = test_new`
# the fill below would also rewrite test_new, so re-running this cell would show
# zero missing values in the "before" print above.
# Test rows cannot be dropped (each one needs a prediction), so missing reviews
# are replaced with the placeholder token "unknown".
test_dataset = test_new.assign(reviewText=test_new['reviewText'].fillna("unknown"))
#test_dataset = test_new.assign(reviewText=test_new['reviewText'].fillna("neutral"))
print(test_dataset.isna().sum())
print(test_dataset.shape)
test_dataset.info()  # info() prints by itself; wrapping it in print() adds a stray "None"

**3.3) Removing duplicate values from movies dataset**

In [None]:
# Keep one row per movieid so the later left-merge cannot multiply review rows.
# NOTE: this rebinds `movies`; from here on it is the trimmed, de-duplicated frame,
# not the raw movies.csv contents.
movies = movies_new.drop_duplicates(subset= ['movieid']) #removing the duplicates from the movies file
print(movies_new.shape)  # shape before removing duplicates
print(movies.shape)      # shape after removing duplicates

#movies = movies.fillna('')
print(movies.isna().sum())

There are missing values present in movies_new, which will be imputed in subsequent stages.

In [None]:
print(movies.head())
print(movies.shape) #to see that there are 143258 rows in movies.csv
print(movies.isna().sum()) # to see the missing values.we can see here that audienceScore has missing values.

**JOINING THE MOVIES DATA WITH THE DATA SET**


**a.First ,merge the movies data with train data**

In [None]:
train_dataset_merged = pd.merge(train_dataset,movies,on = 'movieid',how ='left')


In [None]:
print(train_dataset.shape)
print(train_dataset_merged.shape)

analyse the dataset: new train_dataset_merged

In [None]:
print(train_dataset_merged.iloc[0])
print(train_dataset_merged.shape)
print(train_dataset_merged.info())
print(train_dataset_merged.head())


**b. Now, merge the movies data with test data**

In [None]:
print(test_dataset.shape)
test_dataset_merged = pd.merge(test_dataset,movies,on = 'movieid',how ='left')
print(test_dataset_merged.shape)
test_dataset_merged = test_dataset_merged.drop(['movieid'],axis =1) #REMOVED movieid from test_dataset
print(test_dataset_merged.info())

In [None]:
train_dataset_merged = train_dataset_merged[['reviewText','audienceScore','sentiment']]
train_dataset_merged.head()

* Now we have **'train_dataset_merged'** and **'test_dataset_merged'**. These have an additional feature called **'audienceScore'**, which we obtained (via merge) from the movies dataset.

* Initially ,models will be built with a single feature which is **reviewText**, the scope includes demonstrating the effect of adding additional features to the model.

* There will be some missing values in the audienceScore column,which will be rectified using the technique of **Imputation**.

* The train dataset is divided into train and validation sets; imputation and the rest of the data preprocessing are then applied to each set.

**4.DATA CLEANING AND DATA PREPROCESSING**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
import numpy as np

In [None]:
train_dataset_merged = train_dataset_merged[['reviewText','audienceScore','sentiment']] # REMOVED movieid from train data
train_dataset_merged.info()


In [None]:
# Features are the first two columns (reviewText, audienceScore); the target is the third (sentiment).
feature_names = ['reviewText', 'audienceScore']
target_name = 'sentiment'
X = train_dataset_merged[feature_names]
y = train_dataset_merged[target_name]
print(X.info())
print(y.info())

In [None]:
X.head()
#y.head()

**SPLITTING THE DATA INTO TRAIN AND EVALUATION SETS**

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size =0.2 ,random_state =42)

In [None]:
print(y_train.head())
print(y_test.head())

In [None]:
#Encoding the label values in the data.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Fit the label -> integer mapping on the training labels only ...
y_train_enc = le.fit_transform(y_train)
# ... and reuse that same mapping for the evaluation labels. Calling fit_transform
# again would refit the encoder on y_test, which can produce a different mapping
# if the two sets do not contain the same classes.
y_test_enc = le.transform(y_test)

In [None]:
print(y_train_enc[0:5])
print(y_test_enc[0:5])

Now we have the following sets of data: **X_train,X_test,y_train,y_test** and **'test_dataset_merged'**.Now we need to clean and preprocess the data.

**4.1) IMPUTATION AND SCALING OF 'audienceScore' FEATURE**

In [None]:
#function to impute a numeric column (audienceScore by default) with its mean value.
def impute_audience_score(dataset, column_name='audienceScore'):
    """Replace missing values in ``dataset[column_name]`` with the column mean.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the column to impute. It is modified in place.
    column_name : str, default 'audienceScore'
        Column to impute. Earlier versions ignored this argument and always
        used 'audienceScore'; the default keeps that behaviour for callers
        that omit it.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object, returned so the function can be used as a
        ``FunctionTransformer`` step.

    Notes
    -----
    The mean is computed from whichever frame is passed in, so calling this
    separately on train / eval / test fills each with its own mean.
    """
    imputer = SimpleImputer(strategy='mean')
    # SimpleImputer expects a 2-D array, hence the (n_rows, 1) reshape.
    column_values = dataset[column_name].values.reshape(-1, 1)
    # ravel() turns the (n_rows, 1) result back into a 1-D column.
    dataset[column_name] = imputer.fit_transform(column_values).ravel()
    return dataset

In [None]:
#function to standardise the values of a numeric column (audienceScore by default).
def scale_audience_score(dataset, column_name='audienceScore'):
    """Standardise ``dataset[column_name]`` with ``StandardScaler``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the column to scale. It is modified in place.
    column_name : str, default 'audienceScore'
        Column to scale. Earlier versions ignored this argument and always
        used 'audienceScore'; the default keeps that behaviour for callers
        that omit it.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object, returned so the function can be used as a
        ``FunctionTransformer`` step.

    Notes
    -----
    A new scaler is fitted on every call, so each frame passed in is scaled
    with its own statistics rather than with the training set's.
    """
    scaler = StandardScaler()
    # StandardScaler expects a 2-D array, hence the (n_rows, 1) reshape.
    column_values = dataset[column_name].values.reshape(-1, 1)
    # ravel() turns the (n_rows, 1) result back into a 1-D column.
    dataset[column_name] = scaler.fit_transform(column_values).ravel()
    return dataset

In [None]:
from sklearn.preprocessing import FunctionTransformer

**5) INTRODUCING A PIPELINE (for scaling and imputing)**

In [None]:
#pipeline to do imputation and scaling.
# Each step wraps a plain function in a FunctionTransformer; kw_args forwards
# column_name='audienceScore' to that function on every call.
# Step order matters: missing values are filled first, then the column is scaled.
pipeline = Pipeline([
    ('imputation', FunctionTransformer(impute_audience_score, validate=False, kw_args={'column_name': 'audienceScore'})),
    ('scaling', FunctionTransformer(scale_audience_score, validate=False, kw_args={'column_name': 'audienceScore'}))
])

Applying the pipeline to all three datasets:

In [None]:
X_train = pipeline.transform(X_train)

In [None]:
X_test = pipeline.transform(X_test)

In [None]:
test_dataset_merged = pipeline.transform(test_dataset_merged)

In [None]:
print(X_train['audienceScore'].isna().sum())
print(X_test['audienceScore'].isna().sum())
print(test_dataset_merged['audienceScore'].isna().sum())

In [None]:
test_dataset_merged['audienceScore'].info()

In [None]:
print(X_train.head())
print(type(X_train))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

**TEXT DATA PREPROCESSING - REMOVE INTEGERS AND VECTORIZE**

In [None]:
#Function to remove integers from the text.
# The standard-library `re` module is enough for these patterns; importing the
# third-party `regex` package under the name `re` added a dependency and hid
# which engine was actually in use.
import re

_DIGIT_RUN = re.compile(r'\d+')  # one or more consecutive digits


def RemInteger(data):
    """Return ``data`` with every run of digits removed.

    Parameters
    ----------
    data : str
        Text to clean.

    Returns
    -------
    str
        ``data`` without digit characters; all other characters, including
        the whitespace around the removed digits, are kept.
    """
    return _DIGIT_RUN.sub('', data)


In [None]:
#Applying the function to all three datasets.
X_train['reviewText'] = X_train['reviewText'].apply(RemInteger)
X_test['reviewText'] = X_test['reviewText'].apply(RemInteger)
test_dataset_merged['reviewText'] = test_dataset_merged['reviewText'].apply(RemInteger)

**6) TF-IDF VECTORIZER**

Vectorise the reviewText column of X_train, X_test and test_dataset_merged.

In [None]:
print(X_train.head())
print(X_test.head())
print(test_dataset_merged.head())

Removing special characters, punctuation and numbers.

In [None]:
#function to remove special characters and punctuation.
def REM_SPECIAL_CHAR(textdata):
    """Keep only ASCII letters and whitespace in ``textdata``.

    Every other character (punctuation, digits, symbols, accented letters)
    is deleted; nothing is inserted in its place.
    """
    return re.sub(r'[^a-zA-Z\s]', '', textdata)

**THE TRAIN,EVAL AND TEST SET**

In [None]:
#applying the function to all three datasets to remove special charecters and punctuations.
X_train['reviewText'] = X_train['reviewText'].apply(REM_SPECIAL_CHAR)
X_test['reviewText'] = X_test['reviewText'].apply(REM_SPECIAL_CHAR)
test_dataset_merged['reviewText'] = test_dataset_merged['reviewText'].apply(REM_SPECIAL_CHAR)

In [None]:
test_dataset_merged['reviewText'][20]

In [None]:
def custom_tokenizer(textdata):
    """Lower-case ``textdata`` and split it on whitespace.

    Anything that is not a ``str`` (for example a NaN float) produces an
    empty token list instead of raising.
    """
    if not isinstance(textdata, str):
        return []
    lowered = textdata.lower()
    return lowered.split()

In [None]:
custom_stop_words = [
   "the","thi","hi","wa","les","i","species","and", "a", "an", "in", "of", "to", "for", "on", "with",
    "this", "that", "it", "is", "as", "at", "by", "from", "about", "was",
    "were", "which", "who", "you", "your", "we", "our", "they", "them", "he", "his", "she", "her",
    "but", "or", "so", "not", "just", "like", "up", "down", "out", "more", "less",
    "movie", "film", "story", "plot", "character", "characters", "scene", "scenes"
]

Keeping multiple instances of vectorizer to see the effect of customised stop words,and tokenizer

In [None]:
#vectorize = TfidfVectorizer(stop_words = 'english',tokenizer = custom_tokenizer)

In [None]:
#vectorizer without custom tokenizer
#vectorize = TfidfVectorizer(stop_words = 'english')
# Pass the list object itself. The quoted string 'custom_stop_words' is treated
# by scikit-learn as the name of a built-in stop list, and the only such name it
# accepts is 'english', so fitting would fail.
vectorize = TfidfVectorizer(stop_words = custom_stop_words)

In [None]:
#Now that the data is clean let us use TFIDFvectorizer on the datasets to vectorize the text data.
VectReviewText_X_train = vectorize.fit_transform(X_train['reviewText'])

In [None]:
test_dataset_merged['reviewText'].info()

In [None]:
VectReviewText_X_test = vectorize.transform(X_test['reviewText'])
VectReviewText_testdata = vectorize.transform(test_dataset_merged['reviewText'])

**MODEL TRAINING**

**BASELINE MODEL -LOGISTIC REGRESSION**

In [None]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter = 500,random_state =42)

In [None]:
#lr.fit(VectReviewText_X_train,y_train)

In [None]:
#y_pred = lr.predict(VectReviewText_X_test)
#y_pred[3:5]

In [None]:
#from sklearn.metrics import accuracy_score
#accuracy = accuracy_score(y_test,y_pred)
#accuracy

In [None]:
#test_prediction = lr.predict(VectReviewText_testdata_merged)


In [None]:
#submission = pd.DataFrame(columns= ['id','sentiment'])
#submission['id'] = [i for i in range(len(test_prediction))]
#submission['sentiment'] = test_prediction

In [None]:
#submission.head()

In [None]:
# SUBMISSION OF LOGISTIC REGRESSION
#submission.to_csv('submission.csv',index = False)

**The submission score was .79260 for logistic regression**

**SUPPORT VECTOR MACHINE - LinearSVC  (MODEL-1)**

In [None]:
from sklearn.svm import LinearSVC

In [None]:
clf = LinearSVC(loss = 'hinge',max_iter=10000)
#clf = LinearSVC(max_iter=10000)
#clf.fit(VectReviewText_X_train, y_train)

In [None]:
#y_pred = clf.predict(VectReviewText_X_test)
#y_pred[3:5]

In [None]:
#from sklearn.metrics import accuracy_score
#accuracy = accuracy_score(y_test,y_pred)
#print(accuracy)

In [None]:
#from sklearn.metrics import f1_score
#f1_micro = f1_score(y_test, y_pred, average='micro')
#print(f1_micro)

In [None]:
#test_prediction = clf.predict(VectReviewText_testdata_merged)

In [None]:
#submission = pd.DataFrame(columns= ['id','sentiment'])
#submission['id'] = [i for i in range(len(test_prediction))]
#submission['sentiment'] = test_prediction

In [None]:
#submission.head()

In [None]:
#SUBMISSION OF SUPPORT VECTOR MACHINE
#save the submission dataframe into .csv file
#submission.to_csv('submission.csv',index = False)

**With LinearSVC we got a score of 0.79287, which is an improvement over the Logistic Regression model's score of 0.79260.**

**DOING HYPERPARAMETER TUNING FOR LINEAR SVC**

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
param_dist = {'C': uniform(loc=0, scale=5)}
random_search = RandomizedSearchCV(clf, param_distributions=param_dist, cv=5, n_iter=10,random_state =42)
random_search.fit(VectReviewText_X_train, y_train)

In [None]:
best_params = random_search.best_params_
print(best_params)

In [None]:
best_params = random_search.best_params_
print(best_params)#{'C': 0.7800932022121826}

In [None]:
#best_clf = LinearSVC(max_iter=10000, **best_params)
best_clf = LinearSVC(max_iter=10000,loss = 'hinge', **best_params)

In [None]:
best_clf.fit(VectReviewText_X_train, y_train)

In [None]:
y_pred = best_clf.predict(VectReviewText_X_test)
y_pred[3:5]

In [None]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test,y_pred)
print(accuracy)

In [None]:
from sklearn.metrics import f1_score
f1_micro = f1_score(y_test, y_pred, average='micro')
print(f1_micro) #test score of 0.804

In [None]:
test_prediction = best_clf.predict(VectReviewText_testdata)

In [None]:
print(best_clf.predict(VectReviewText_testdata[16]))
(test_dataset_merged['reviewText'][16])

In [None]:
# Submission frame: a running row id (0..n-1) paired with the predicted sentiment.
row_ids = list(range(len(test_prediction)))
submission = pd.DataFrame({'id': row_ids, 'sentiment': test_prediction})

In [None]:
submission.head()

In [None]:
#save the submission dataframe into .csv file
submission.to_csv('submission.csv',index = False)

**The submission score for LinearSVC after HPT was 0.79311 which is an improvement from the score without HPT ,which was 0.79287.**

Note : Linear svc with

**CART- MODEL-2**

In [None]:
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.metrics import accuracy_score
#cart = DecisionTreeClassifier()

In [None]:
#cart.fit(VectReviewText_X_train, y_train)

In [None]:
#y_pred_cart = cart.predict(VectReviewText_X_test)
#y_pred_cart[3:5]

In [None]:
#accuracy_cart = accuracy_score(y_test,y_pred_cart)
#accuracy_cart

In [None]:
#from sklearn.metrics import f1_score
#f1_micro = f1_score(y_test, y_pred_cart, average='micro')
#print(f1_micro)

**CART -HYPERPARAMETER TUNING**

In [None]:
#from sklearn.model_selection import RandomizedSearchCV

In [None]:
#param_grid = {'criterion' :['gini', 'entropy','log_loss']}

In [None]:
#random_search = RandomizedSearchCV(cart, param_grid, cv=2)

In [None]:
# Disabled like the rest of the CART experiment: the cell that would create the
# CART `random_search` is commented out, so running this line would only re-fit
# the earlier LinearSVC search.
#random_search.fit(VectReviewText_X_train, y_train)

In [None]:
#best_params_cart = random_search.best_params_
#best_params_cart

In [None]:
#cart_best = random_search.best_estimator_
#y_pred_cart = cart_best.predict(VectReviewText_X_test)

In [None]:
#accuracy_cart = accuracy_score(y_test,y_pred_cart)
#accuracy_cart

**K NEAREST NEIGHBOR CLASSIFIER -MODEL 3**

In [None]:
#from sklearn.neighbors import KNeighborsClassifier

In [None]:
#knn = KNeighborsClassifier(n_neighbors=5,algorithm = "ball_tree")

In [None]:
#knn.fit(VectReviewText_X_train,y_train)

In [None]:
#y_pred_knn = knn.predict(VectReviewText_X_test)

In [None]:
#y_pred_knn[3:5]

In [None]:
#from sklearn.metrics import accuracy_score
#accuracy = accuracy_score(y_test,y_pred_knn)
#accuracy

**K-NEAREST NEIGHBOR- HYPERPARAMETER TUNING**

In [None]:
#from sklearn.model_selection import RandomizedSearchCV

In [None]:
#param_dist = {'n_neighbors': np.arange(1,10),'weights': ['uniform', 'distance'],'p': [1, 2]}

In [None]:
#random_search = RandomizedSearchCV(knn_classifier, param_distributions=param_dist,n_iter=100,cv=5,n_jobs=-1,random_state=42)

In [None]:
#random_search.fit(VectReviewText_X_train,y_train)

In [None]:
#best_params_cart = random_search.best_params_
#best_params_cart

In [None]:
#y_pred_knn = knn.predict(VectReviewText_X_test)

In [None]:
#knn_best = random_search.best_estimator_
#y_pred_knn = knn_best.predict(VectReviewText_X_test)

In [None]:
#from sklearn.metrics import f1_score
#f1_micro = f1_score(y_test, y_pred_knn, average='micro')
#print(f1_micro)

**XGBOOST MODEL - MODEL-4**

XGBoost models use ensemble technique making it a good choice for classification algorithm.

In [None]:
#XGBOOST
#import xgboost as xgb
#model = xgb.XGBClassifier()
#model.fit(VectReviewText_X_train,y_train_enc)
#y_pred_xgb = model.predict(VectReviewText_X_test)
#from sklearn.metrics import f1_score
#f1_micro = f1_score(y_test_enc, y_pred_xgb, average='micro')
#print(f1_micro) #0.85

In [None]:
#import xgboost as xgb
#from sklearn.model_selection import RandomizedSearchCV
#model = xgb.XGBClassifier()
#param_grid_xgb= {
 #   'n_estimators': [100, 200, 300],
  #  'learning_rate': [0.01, 0.1, 0.2],
   # 'max_depth': [3, 5, 7],'subsample': [0.8, 0.9, 1.0],
    #'colsample_bytree': [0.8, 0.9, 1.0]
#}

In [None]:
#rand_search_xgb = RandomizedSearchCV(estimator=model, param_distributions=param_grid_xgb, n_iter=10, scoring='f1_micro', cv=3, random_state=42)


In [None]:
#rand_search_xgb.fit(VectReviewText_X_train, y_train_enc)

In [None]:
#best_params = rand_search_xgb.best_params_
#best_params #{'subsample': 0.9, 'n_estimators': 200,'max_depth': 7,'learning_rate': 0.2, 'colsample_bytree': 1.0}

In [None]:
#best_model_xgb = xgb.XGBClassifier(**best_params)
#best_model_xgb = xgb.XGBClassifier(subsample=0.9,n_estimators=200,max_depth=7,learning_rate=0.2,colsample_bytree=1.0)
#best_model_xgb.fit(VectReviewText_X_train,y_train_enc)

In [None]:
#from sklearn.metrics import f1_score
#y_pred_xgb_best = best_model_xgb.predict(VectReviewText_X_test)
#f1_micro_best = f1_score(y_test_enc, y_pred_xgb_best, average='micro')
#print(f1_micro_best) #eval score was 0.86402 for XGBoost

In [None]:
#test_prediction = best_model_xgb.predict(VectReviewText_testdata_merged)


In [None]:
#submission = pd.DataFrame(columns= ['id','sentiment'])
#submission['id'] = [i for i in range(len(test_prediction))]
#submission['sentiment'] = test_prediction
#submission.head()


In [None]:
#submission.info()

In [None]:
#submission.loc[submission['sentiment'] == 1, 'sentiment'] = 'POSITIVE'
#submission.loc[submission['sentiment'] == 0, 'sentiment'] = 'NEGATIVE'
#submission.head()

In [None]:
#save the submission dataframe into .csv file
#submission.to_csv('submission.csv',index = False)

**COMPARISON OF MODELS**

* Till now I have trained the following models; 
* 1. LogisticRegression (Baseline ) (score= 0.79264)
* 2. LinearSVC (0.79314)
* 3. DecisionTreeClassifier(0.6986)
* 4. K nearest neighbors classifier.(0.6876)
* 5. XGBoost(submission score 0.7433)

* Out of these, Linear SVC so far yields the **highest score of 0.79314**. The score of LinearSVC was observed to improve after hyperparameter tuning. LogisticRegression comes second with a score of 0.79264. XGBoost yielded a score of 0.7433. The lowest test scores were given by DecisionTreeClassifier (0.6986) and the K nearest neighbors classifier (0.6876).


**SCOPE AND LIMITATIONS**

Further, more features can be incorporated into the model to assess its performance. The features with higher importance can be retained in the best model. Along with reviewText, features with missing values can be imputed and scaled. Categorical features can be encoded. Below is an attempt to explore how features like 'audienceScore', 'runtimeMinutes', 'genre', 'originalLanguage' and 'reviewerName' affect the sentiment prediction. These were the features in which the majority of the entries are non-null.

In [None]:
# Rebuild the multi-feature frames from the raw inputs.
# The X and test_dataset_merged built earlier only carry reviewText and
# audienceScore (reviewerName, genre, originalLanguage and runtimeMinutes were
# dropped), so indexing them with those names raises KeyError.
# movies.csv is re-read because `movies` was rebound to a trimmed frame earlier.
MOVIE_FEATURE_COLUMNS = ['movieid', 'audienceScore', 'runtimeMinutes', 'genre', 'originalLanguage']
CATEGORICAL_FILL_COLUMNS = ['reviewerName', 'genre', 'originalLanguage']
FEATURE_COLUMNS = ['reviewText', 'reviewerName', 'audienceScore', 'runtimeMinutes', 'genre', 'originalLanguage']

movies_full = (
    pd.read_csv('/kaggle/input/sentiment-prediction-on-movie-reviews/movies.csv')
    .drop_duplicates(subset=['movieid'])  # one row per movie so the merge keeps the review row count
    [MOVIE_FEATURE_COLUMNS]
)

# Training rows without review text are dropped, as in the single-feature experiments.
train_full = (
    train.dropna(subset=['reviewText'])
    .merge(movies_full, on='movieid', how='left')
)
X_multi = train_full[FEATURE_COLUMNS]
y_multi = train_full['sentiment']

# Test rows must all be kept (one prediction per row): fill missing text instead.
# Only FEATURE_COLUMNS are retained, so no unused column (ids, flags) is carried along.
test_dataset_merged = (
    test.assign(reviewText=test['reviewText'].fillna("unknown"))
    .merge(movies_full, on='movieid', how='left')
    [FEATURE_COLUMNS]
    .copy()
)

X_train,X_test,y_train,y_test = train_test_split(X_multi,y_multi,test_size =0.2 ,random_state =42)
# Work on explicit copies so the in-place fills below cannot trigger
# SettingWithCopyWarning or leak back into X_multi.
X_train = X_train.copy()
X_test = X_test.copy()

#Imputing categorical variables in X_train, X_test and test_dataset_merged
for frame in (X_train, X_test, test_dataset_merged):
    frame[CATEGORICAL_FILL_COLUMNS] = frame[CATEGORICAL_FILL_COLUMNS].fillna("unknown")

# The numeric columns (audienceScore, runtimeMinutes) may still contain nulls at this point.
X_train.info()
X_test.info()
test_dataset_merged.info()

In [None]:
def impute_feature(dataset, column_name):
    """Fill the missing values of one numeric column with that column's mean.

    ``dataset`` is modified in place and also returned. The mean is taken
    from the frame that is passed in.
    """
    # SimpleImputer works on 2-D input, so present the column as (n_rows, 1).
    column_values = dataset[column_name].values.reshape(-1, 1)
    mean_imputer = SimpleImputer(strategy='mean')
    dataset[column_name] = mean_imputer.fit_transform(column_values)
    return dataset

In [None]:
impute_feature(X_train,'runtimeMinutes')
impute_feature(X_train,'audienceScore')
impute_feature(X_test,'runtimeMinutes')
impute_feature(X_test,'audienceScore')
impute_feature(test_dataset_merged,'runtimeMinutes')
impute_feature(test_dataset_merged,'audienceScore')

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
numeric_columns = ['audienceScore', 'runtimeMinutes']
categorical_columns = ['genre', 'originalLanguage', 'reviewerName']
# Must be a plain string, not a one-element list: a list makes ColumnTransformer
# hand the vectorizer a 2-D frame, but TfidfVectorizer needs a 1-D sequence of
# documents.
text_column = 'reviewText'

In [None]:
numeric_transformer = MinMaxScaler()
# Keep the encoder's default sparse output: the `sparse=` keyword was renamed to
# `sparse_output` and later removed from scikit-learn, and a dense one-hot matrix
# for a high-cardinality column such as reviewerName can exhaust memory.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
text_transformer = TfidfVectorizer()

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),  # Apply numeric_transformer to numeric_columns
        ('cat', categorical_transformer, categorical_columns),  # Apply categorical_transformer to categorical_columns
        ('text', text_transformer, text_column)  # Apply text_transformer to text_column
    ],
    remainder='passthrough'  # Pass through any columns not specified above
)

In [None]:
# Fit the preprocessing (scaling ranges, category vocabulary, TF-IDF vocabulary)
# on the training split only ...
X_train_transformed = preprocessor.fit_transform(X_train)
# ... then apply that fitted transformation to the evaluation split and to the
# competition test set. Previously all three lines transformed X_train, so the
# "evaluation" predictions were made on the training data.
X_test_transformed = preprocessor.transform(X_test)
test_dataset_transformed = preprocessor.transform(test_dataset_merged)

In [None]:
svm_model = LinearSVC()
svm_model.fit(X_train_transformed, y_train)
y_pred = svm_model.predict(X_test_transformed)

In [None]:
from sklearn.metrics import f1_score
#f1_micro_best = f1_score(y_test, y_pred, average='micro')
#print(f1_micro_best) 

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
param_dist = {'C': uniform(loc=0, scale=5)}
random_search = RandomizedSearchCV(svm_model, param_distributions=param_dist, cv=5, n_iter=10,random_state =42)
random_search.fit(X_train_transformed, y_train)

In [None]:
best_params = random_search.best_params_
print(best_params)

In [None]:
best_clf = LinearSVC(max_iter=10000,loss = 'hinge', **best_params)
best_clf.fit(X_train_transformed, y_train)
y_pred = best_clf.predict(X_test_transformed)