In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sb
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing libraries

In [None]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, roc_curve, roc_auc_score, auc,confusion_matrix,make_scorer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

import matplotlib.pyplot as plt
import seaborn as sns

# Data Loading

In [None]:
# Read the three competition CSV files in one pass: labelled reviews,
# unlabelled reviews to predict, and per-movie metadata.
train_data, test_data, movies_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/sentiment-prediction-on-movie-reviews/train.csv",
        "/kaggle/input/sentiment-prediction-on-movie-reviews/test.csv",
        "/kaggle/input/sentiment-prediction-on-movie-reviews/movies.csv",
    )
)

# Data Information

In [None]:
# Preview the first rows of train_data.
train_data.head()

In [None]:
# Preview the first rows of test_data.
test_data.head()

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
train_data.info()
# Number of missing values in each column.
print(train_data.isna().sum())

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
test_data.info()
# Number of missing values in each column.
print(test_data.isna().sum())

In [None]:
# Preview the first rows of movies_data.
movies_data.head()

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
movies_data.info()
# Number of missing values in each column.
print(movies_data.isna().sum())

# Dummy Classifier

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline: a classifier that always predicts the most frequent training label.
# Its hold-out accuracy is the score later models are compared against.
y=train_data['sentiment']
x=train_data.drop(columns=['sentiment','movieid','reviewerName','isFrequentReviewer'])

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

model=DummyClassifier(strategy='most_frequent').fit(x_train,y_train)

y_pred=model.predict(x_test)
score=accuracy_score(y_test,y_pred)
score

# Data Preprocessing

> #### Checking if "isTopCritic" from test dataset is same as "isFrequentReviewer" from train dataset

In [None]:
# Compare the train flag 'isFrequentReviewer' with the test flag 'isTopCritic'
# for every reviewer that appears in both datasets.

# One row per reviewer in each dataset (drop_duplicates keeps the first occurrence).
freq=train_data.drop(['movieid','reviewText','sentiment'],axis=1).drop_duplicates(subset='reviewerName')
top=test_data.drop(['movieid','reviewText'],axis=1).drop_duplicates(subset='reviewerName')

# reviewerName -> flag lookups, built in one pass instead of a row-by-row loop.
d_freq=dict(zip(freq['reviewerName'],freq['isFrequentReviewer']))
d_top=dict(zip(top['reviewerName'],top['isTopCritic']))

# Reviewers present in both datasets, and how many of them carry the same flag.
shared_reviewers=d_freq.keys() & d_top.keys()
total=len(shared_reviewers)
same=sum(bool(d_freq[name]==d_top[name]) for name in shared_reviewers)

# Guard the ratio: with no shared reviewer the percentage is undefined and the
# division would raise ZeroDivisionError.
if total:
    print("Percentage of identical datas is",(same/total)*100)
else:
    print("No reviewer appears in both datasets; nothing to compare")

> #### Merging the **'train_data'** and **'movies_data'**

In [None]:
# Copy selected movie attributes into train_data, matched on 'movieid'.
movie_columns=['audienceScore','originalLanguage','runtimeMinutes','distributor','boxOffice','director']

# One row per movieid. keep='last' gives the same winner as filling a dict row
# by row, where a later duplicate overwrites an earlier one.
movie_lookup=movies_data.drop_duplicates(subset='movieid',keep='last').set_index('movieid')

# Series.map performs the whole lookup at once. A movieid missing from
# movies_data yields NaN rather than an empty string, which keeps the numeric
# columns numeric and lets the later fillna / mean-imputation / boxOffice
# parsing steps treat such rows as ordinary missing values.
for col in movie_columns:
    train_data[col]=train_data['movieid'].map(movie_lookup[col])
        

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
train_data.info()
# Number of missing values in each column after the merge.
print(train_data.isna().sum())
train_data.head()

> #### Preprocessing the boxOffice data

In [None]:
# Split each textual boxOffice value into two columns:
#   'boxOffice' -> the text without its first and last character, as float
#   'money'     -> the last character (read by the next cell: 'K', 'M', or a digit)
# Values that are not strings (missing values) become NaN in both columns.
raw_box=train_data['boxOffice'].astype(object)
is_text=raw_box.map(lambda value: isinstance(value,str))

# Drop the leading character (presumably a currency symbol - TODO confirm).
box_text=raw_box.where(is_text).str[1:]

# .str[-1] yields NaN for an empty string instead of raising IndexError.
train_data['money']=box_text.str[-1]

# Everything before the last character; an empty remainder counts as missing.
magnitude=box_text.str[:-1]
magnitude=magnitude.mask(magnitude=='')
train_data['boxOffice']=magnitude.astype(float)

train_data[['boxOffice','money']]

In [None]:

# Turn the (boxOffice, money) pair back into a single numeric amount.
#   money == 'K' -> boxOffice * 1000
#   money == 'M' -> boxOffice * 1000000
#   otherwise    -> 'money' is taken to be the last digit that was split off,
#                   so it is re-attached as boxOffice * 10 + digit
# NOTE(review): re-attaching the digit is only right for whole numbers; a value
# with a decimal point but no suffix would be mis-scaled - verify against data.
# NOTE: this cell rescales 'boxOffice' in place, so it must run exactly once.
unit=train_data['money']
magnitude=train_data['boxOffice']

multiplier=unit.map({'K':1000,'M':1000000})
has_suffix=multiplier.notna()

# Raises ValueError for an unexpected non-numeric trailing character, as the
# explicit float() conversion would.
trailing_digit=pd.to_numeric(unit.where(~has_suffix))

train_data['boxOffice']=np.where(has_suffix,magnitude*multiplier,magnitude*10+trailing_digit)

train_data['boxOffice'].tail()


# Exploratory Data Analysis

In [None]:
# The helper column 'money' is no longer needed; summarise every remaining column.
train_data=train_data.drop(columns='money')
train_data.describe(include='all').transpose()

In [None]:
# Share of missing values per column, expressed as a percentage of all rows.
missing_pct=train_data.isna().sum().div(len(train_data)).mul(100)
print(missing_pct)

In [None]:
# Numerical columns, explored with histograms and box plots.
num_data=[
    'audienceScore',
    'runtimeMinutes',
    'boxOffice',
]
# Categorical columns (including the target 'sentiment').
cat_data=[
    'reviewerName',
    'isFrequentReviewer',
    'originalLanguage',
    'distributor',
    'director',
    'sentiment',
]

### Univariate Analysis for Numerical Data

In [None]:
# For every numerical column: print its skewness, then draw a histogram (left)
# and a box plot (right) side by side.
for col in num_data:
    column_values = train_data[col]
    print(col)
    print('Skew :', round(column_values.skew(), 1))
    fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize = (15, 4))
    column_values.hist(grid=False, ax=ax_hist)
    ax_hist.set_ylabel('count')
    sns.boxplot(x=column_values, ax=ax_box)
    plt.show()

### Univariate Analysis for categorical data

In [None]:
fig, axes = plt.subplots(3, 2, figsize = (18, 18))
fig.suptitle('Bar plot for all categorical variables in the dataset')

# One panel per categorical column:
# (column, number of most frequent categories to show or None for all,
#  tick-label rotation in degrees or None to leave the default).
panels = [
    ('sentiment', None, None),
    ('isFrequentReviewer', None, None),
    ('originalLanguage', 10, None),
    ('distributor', 20, 45),
    ('director', 20, 90),
    ('reviewerName', 20, 90),
]

for ax, (column, top_n, rotation) in zip(axes.flat, panels):
    # Rank categories over the whole column first, THEN keep the top ones.
    # Taking head() before value_counts() would only rank whatever categories
    # happen to occur in the first few rows of the frame.
    counts = train_data[column].value_counts()
    if top_n is not None:
        counts = counts.head(top_n)
    sns.countplot(ax = ax, x = column, data = train_data, color = 'blue', order = counts.index)
    if rotation is not None:
        ax.tick_params(labelrotation=rotation)

### Bivariate Analysis 

In [None]:
# Pairwise relationships between the numeric columns and the encoded target.
enc=LabelEncoder()
data_=train_data.drop(['reviewerName','isFrequentReviewer','originalLanguage','distributor','director'],axis=1)
# Encode the sentiment labels as integers so the target takes part in the plot.
data_['sentiment']=enc.fit_transform(data_['sentiment'])
# sns.pairplot creates and sizes its own figure, so no plt.figure() call is
# made here (it would only emit an extra, empty figure).
sns.pairplot(data=data_)
plt.show()

## Heat Map 

In [None]:
plt.figure(figsize=(6, 4))
# data_ still carries non-numeric columns (e.g. reviewText). Recent pandas
# versions raise when corr() meets such columns, so the correlation matrix is
# computed on the numeric columns only.
sns.heatmap(data_.select_dtypes(include='number').corr(), annot = True, vmin = -1, vmax = 1)
plt.show()

In [None]:

# Text-like columns: a missing value becomes the empty string.
text_cols=['reviewText','originalLanguage','distributor','director']
train_data[text_cols]=train_data[text_cols].fillna('')

# Numeric columns: a missing value becomes the column mean.
numeric_cols=['audienceScore','runtimeMinutes']
imputer=SimpleImputer(strategy='mean')
train_data[numeric_cols]=imputer.fit_transform(train_data[numeric_cols])
train_data.isna().sum()

## Cleaning the reviewText

In [None]:
# Lookup table: contraction -> expanded form.
# It is applied to 'reviewText' by plain substring replacement, one key at a
# time, in the iteration order of this dict.
# Note that the generic "'s" entry matches any possessive or contracted "'s".
contractions_dict = { "ain't": "are not","'s":" is","aren't": "are not",
                     "can't": "cannot","can't've": "cannot have",
                     "'cause": "because","could've": "could have","couldn't": "could not",
                     "couldn't've": "could not have", "didn't": "did not","doesn't": "does not",
                     "don't": "do not","hadn't": "had not","hadn't've": "had not have",
                     "hasn't": "has not","haven't": "have not","he'd": "he would",
                     "he'd've": "he would have","he'll": "he will", "he'll've": "he will have",
                     "how'd": "how did","how'd'y": "how do you","how'll": "how will",
                     "I'd": "I would", "I'd've": "I would have","I'll": "I will",
                     "I'll've": "I will have","I'm": "I am","I've": "I have", "isn't": "is not",
                     "it'd": "it would","it'd've": "it would have","it'll": "it will",
                     "it'll've": "it will have", "let's": "let us","ma'am": "madam",
                     "mayn't": "may not","might've": "might have","mightn't": "might not", 
                     "mightn't've": "might not have","must've": "must have","mustn't": "must not",
                     "mustn't've": "must not have", "needn't": "need not",
                     "needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
                     "oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
                     "shan't've": "shall not have","she'd": "she would","she'd've": "she would have",
                     "she'll": "she will", "she'll've": "she will have","should've": "should have",
                     "shouldn't": "should not", "shouldn't've": "should not have","so've": "so have",
                     "that'd": "that would","that'd've": "that would have", "there'd": "there would",
                     "there'd've": "there would have", "they'd": "they would",
                     "they'd've": "they would have","they'll": "they will",
                     "they'll've": "they will have", "they're": "they are","they've": "they have",
                     "to've": "to have","wasn't": "was not","we'd": "we would",
                     "we'd've": "we would have","we'll": "we will","we'll've": "we will have",
                     "we're": "we are","we've": "we have", "weren't": "were not","what'll": "what will",
                     "what'll've": "what will have","what're": "what are", "what've": "what have",
                     "when've": "when have","where'd": "where did", "where've": "where have",
                     "who'll": "who will","who'll've": "who will have","who've": "who have",
                     "why've": "why have","will've": "will have","won't": "will not",
                     "won't've": "will not have", "would've": "would have","wouldn't": "would not",
                     "wouldn't've": "would not have","y'all": "you all", "y'all'd": "you all would",
                     "y'all'd've": "you all would have","y'all're": "you all are",
                     "y'all've": "you all have", "you'd": "you would","you'd've": "you would have",
                     "you'll": "you will","you'll've": "you will have", "you're": "you are",
                     "you've": "you have"}

# Expand contractions in the training reviews, then lower-case the text.

# Apply longer keys before shorter ones. Several keys are substrings of others
# ("'s" inside "let's", "can't" inside "can't've", "he'll" inside "he'll've");
# if the short key runs first it rewrites part of the longer contraction and
# the longer key can never match. The re-ordered dict is stored back so every
# later loop over contractions_dict uses the same order. sorted() is stable, so
# keys of equal length keep their relative order.
contractions_dict=dict(sorted(contractions_dict.items(),key=lambda item: len(item[0]),reverse=True))

for key in contractions_dict:
    # regex=False: keys are literal substrings, independent of the pandas default.
    train_data['reviewText'] = train_data['reviewText'].str.replace(key,contractions_dict[key],regex=False)

# NOTE(review): replacement is case-sensitive and happens before lower-casing,
# so a capitalised contraction such as "Don't" is not expanded.
# .str.lower() lower-cases every string at once and leaves missing values as they are.
train_data['reviewText']=train_data['reviewText'].str.lower()

> #### Scaling the Numerical data and merging the text data

In [None]:

# Build one space-separated text field from language, distributor, director
# and reviewer name; it is vectorised later as a single document per row.
merged_text=train_data['originalLanguage']
for part in ('distributor','director','reviewerName'):
    merged_text=merged_text+' '+train_data[part]
train_data['merge']=merged_text

# Drop the columns that were folded into 'merge' plus the ones not used as features.
train_data=train_data.drop(
    columns=['movieid','reviewerName','originalLanguage','director','distributor','boxOffice','runtimeMinutes']
)

scaler_1, scaler_2, l_encoder = StandardScaler(), MinMaxScaler(), LabelEncoder()

# Standardise the audience score and encode the reviewer flag as integers.
train_data['audienceScore']=scaler_1.fit_transform(train_data[['audienceScore']])
train_data['isFrequentReviewer']=l_encoder.fit_transform(train_data['isFrequentReviewer'])
train_data.head()

In [None]:
# Separate the target column from the feature columns.
y=train_data['sentiment']
x=train_data.drop(columns=['sentiment'])

print(y)
x.head()

1. ### Splitting the data into training and testing datasets
2. ### Converting them to sparse matrices
3. ### Vectorizing the text data

In [None]:
# Hold out 20% of the rows for validation.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=33
)

# Review text: TF-IDF over uni-, bi- and tri-grams, fitted on the training rows only.
vect_1=TfidfVectorizer(ngram_range=(1,3))
x_train_0=vect_1.fit_transform(x_train['reviewText'])
x_test_0=vect_1.transform(x_test['reviewText'])

# Merged metadata text: TF-IDF over uni- and bi-grams, fitted on the training rows only.
vect_2=TfidfVectorizer(ngram_range=(1,2))
merge_train=vect_2.fit_transform(x_train['merge'])
merge_test=vect_2.transform(x_test['merge'])

In [None]:
from scipy.sparse import hstack, csr_matrix, vstack

# Each scalar feature becomes a one-column sparse matrix so that it can be
# stacked next to the TF-IDF matrices.
x_train_1=csr_matrix(x_train['audienceScore'].to_numpy().reshape(-1,1))
x_train_4=csr_matrix(x_train['isFrequentReviewer'].to_numpy().reshape(-1,1))

# Column order: review TF-IDF | metadata TF-IDF | audienceScore | isFrequentReviewer.
# From here on x_train is the sparse feature matrix, no longer a DataFrame.
x_train=hstack([x_train_0,merge_train,x_train_1,x_train_4])
x_train.shape

In [None]:
# Same construction for the hold-out rows: one-column sparse matrices for the
# scalar features, stacked next to the TF-IDF matrices in the same column order.
x_test_1=csr_matrix(x_test['audienceScore'].to_numpy().reshape(-1,1))
x_test_4=csr_matrix(x_test['isFrequentReviewer'].to_numpy().reshape(-1,1))

# From here on x_test is the sparse feature matrix, no longer a DataFrame.
x_test=hstack([x_test_0,merge_test,x_test_1,x_test_4])
x_test.shape

## Feature Selection

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Re-assemble the full feature matrix and target, in the same row order.
x=vstack([x_train,x_test])
y=pd.concat([y_train,y_test],ignore_index=True)

# Keep at most 120000 features. The cap is clamped to the number of available
# columns because SelectKBest cannot keep more features than the matrix has.
k = min(120000, x.shape[1])
selector = SelectKBest(score_func=f_classif, k=k)
# NOTE(review): the selector is fitted on the training AND hold-out rows
# together, so hold-out labels influence which features are kept; scores
# measured on a later split of this matrix are optimistic.
train_top_k = selector.fit_transform(x, y)

print(train_top_k.shape)

#### Splitting the selected features into train and test datasets

In [None]:
# Split the reduced feature matrix again: 80% for fitting, 20% for evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    train_top_k,
    y,
    test_size=0.2,
    random_state=13,
)

## Preprocessing the provided test dataset

In [None]:
# ---------------------------------------------------------------------------
# Apply to test_data the same preparation steps that train_data went through.
# ---------------------------------------------------------------------------

# 1) Copy movie attributes into test_data, matched on 'movieid'.
#    keep='last' gives the same winner as filling a dict row by row. A movieid
#    missing from movies_data yields NaN (not an empty string), so numeric
#    columns stay numeric and can be imputed below.
movie_lookup=movies_data.drop_duplicates(subset='movieid',keep='last').set_index('movieid')
for col in ['director','audienceScore','originalLanguage','runtimeMinutes','distributor','boxOffice']:
    test_data[col]=test_data['movieid'].map(movie_lookup[col])

test_data.info()

# 2) Text-like columns: a missing value becomes the empty string.
text_cols=['reviewText','originalLanguage','distributor','director']
test_data[text_cols]=test_data[text_cols].fillna('')

# 3) Numeric columns: fill gaps with the means learned from the TRAINING data.
#    `imputer` is the SimpleImputer fitted on train_data earlier in the
#    notebook; re-fitting it here would fill test rows with test-set means.
numeric_cols=['audienceScore','runtimeMinutes']
test_data[numeric_cols]=imputer.transform(test_data[numeric_cols])

# Show one raw boxOffice value before it is parsed.
print(test_data['boxOffice'][0])

# 4) Parse boxOffice text into a number: drop the first character (presumably a
#    currency symbol - TODO confirm), then apply a trailing 'K' (x1000) or 'M'
#    (x1000000) multiplier. Values without such a suffix are parsed as they
#    are, so their last digit is kept. Text that cannot be parsed becomes NaN.
box_text=test_data['boxOffice'].astype(object)
box_text=box_text.where(box_text.map(lambda value: isinstance(value,str))).str[1:]

multiplier=box_text.str[-1].map({'K':1000,'M':1000000})
has_suffix=multiplier.notna()
number_text=box_text.where(~has_suffix,box_text.str[:-1])

test_data['boxOffice']=pd.to_numeric(number_text,errors='coerce')*multiplier.fillna(1)

# 5) Missing box-office amounts fall back to a fixed default.
test_data['boxOffice']=test_data['boxOffice'].fillna(25000000)

In [None]:
# Preview the first rows of test_data after preprocessing.
test_data.head()

In [None]:
# Expand contractions, then lower-case, exactly as for the training reviews.
for key in contractions_dict:
    # regex=False: keys are literal substrings, independent of the pandas default.
    test_data['reviewText'] = test_data['reviewText'].str.replace(key,contractions_dict[key],regex=False)

test_data['reviewText']=test_data['reviewText'].str.lower()

# Same merged metadata text field as in the training data.
test_data['merge']=test_data['originalLanguage']+' '+test_data['distributor']+' '+test_data['director']+' '+test_data['reviewerName']
test_data=test_data.drop(['movieid','reviewerName','originalLanguage','director','distributor'],axis=1)

# Reuse the transformers fitted on the TRAINING data (scaler_1 on audienceScore,
# l_encoder on isFrequentReviewer). Re-fitting them on the test set would put
# the test features on a different scale / encoding than the one the models
# were trained on.
test_data['audienceScore']=scaler_1.transform(test_data[['audienceScore']])
test_data['isTopCritic']=l_encoder.transform(test_data['isTopCritic'])

from scipy.sparse import hstack, csr_matrix

# Vectorise with the vocabularies learned on the training split.
test_0=vect_1.transform(test_data['reviewText'])
merge_test=vect_2.transform(test_data['merge'])

# One-column sparse matrices for the scalar features.
# runtimeMinutes and boxOffice are not stacked: the training matrix does not
# contain them either.
test_1=csr_matrix(test_data['audienceScore'].to_numpy().reshape(-1,1))
test_4=csr_matrix(test_data['isTopCritic'].to_numpy().reshape(-1,1))

# Column order must match the training matrix:
# review TF-IDF | metadata TF-IDF | audienceScore | reviewer flag.
test=hstack((test_0,merge_test,test_1,test_4))

# Keep only the columns chosen by the fitted feature selector.
test=selector.transform(test)

test.shape

# Logistic Classifier

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default regularisation and evaluate it on
# the hold-out split.
classifier=LogisticRegression(max_iter=1000).fit(x_train,y_train)

y_pred=classifier.predict(x_test)
score_1=accuracy_score(y_test,y_pred)

print("With LogisticRegression score is",score_1)
print(f"\n {classification_report(y_test,y_pred)}")
print(f"\n {confusion_matrix(y_test,y_pred)}")



### Hyperparameter tuning of the model

In [None]:
# Randomised hyper-parameter search for logistic regression (5-fold CV, accuracy).
lr_clf=LogisticRegression(max_iter=1000)

# Only solver/penalty pairs that LogisticRegression supports are listed:
# 'liblinear' handles both l1 and l2, while 'sag' handles l2 only. A single
# flat grid would also draw sag + l1, whose fits fail and waste search iterations.
param_dist=[
    {'solver':['liblinear'],'penalty':['l1','l2'],'C':[0.01,0.1,1,10]},
    {'solver':['sag'],'penalty':['l2'],'C':[0.01,0.1,1,10]},
]

# random_state makes the sampled candidates, and therefore the selected model
# used for the submission, reproducible across runs.
random_search=RandomizedSearchCV(lr_clf,param_dist,cv=5,scoring='accuracy',random_state=42)

random_search.fit(x_train,y_train)
y_pred=random_search.predict(x_test)
score=accuracy_score(y_test,y_pred)
print("Score after hypertunning the model",score)

print(random_search.best_params_)
print(random_search.best_estimator_)

In [None]:
# Per-class metrics and confusion matrix for the most recent predictions.
print(f"\n {classification_report(y_test,y_pred)}")
print(f"\n {confusion_matrix(y_test,y_pred)}")

## ROC Curve for LogisticRegression

In [None]:
# ROC curve for the tuned logistic regression.
enc=LabelEncoder()
y_true=enc.fit_transform(y_test)

# A ROC curve needs a continuous score per sample. Hard 0/1 predictions give a
# single operating point, and the "AUC" then only reflects that one threshold.
# Column 1 of predict_proba is the probability of the second class in sorted
# label order, which is the class LabelEncoder maps to 1.
y_score=random_search.predict_proba(x_test)[:,1]

fpr, tpr, thresholds = roc_curve(y_true, y_score)
roc_auc = auc(fpr, tpr)

print(roc_auc_score(y_true, y_score))

# Plotting the ROC Curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='b', lw=2, label=f'AUC = {roc_auc:.2f}')
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

# KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours with default settings, evaluated on the hold-out split.
knn=KNeighborsClassifier().fit(x_train,y_train)

y_pred=knn.predict(x_test)
score_knn=accuracy_score(y_test,y_pred)

print("With KNN score is",score_knn)
print(f"\n {classification_report(y_test,y_pred)}")
print(f"\n {confusion_matrix(y_test,y_pred)}")

### ROC Curve for KNN

In [None]:
# ROC curve for the KNN classifier.
y_true=enc.fit_transform(y_test)

# A ROC curve needs a continuous score per sample; hard 0/1 predictions give a
# single operating point only. Column 1 of predict_proba is the neighbour-vote
# share of the second class in sorted label order, which is the class
# LabelEncoder maps to 1.
# NOTE: this runs a second neighbour search over x_test.
y_score=knn.predict_proba(x_test)[:,1]

fpr, tpr, thresholds = roc_curve(y_true, y_score)
roc_auc = auc(fpr, tpr)

print(roc_auc_score(y_true, y_score))

# Plotting the ROC Curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='b', lw=2, label=f'AUC = {roc_auc:.2f}')
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

# SVM Classifier

> ### SVC

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with default settings, evaluated on the hold-out split.
svm=SVC().fit(x_train,y_train)

y_pred=svm.predict(x_test)
score_svm=accuracy_score(y_test,y_pred)

print("With SVC score is",score_svm)
print(f"\n {classification_report(y_test,y_pred)}")
print(f"\n {confusion_matrix(y_test,y_pred)}")

## ROC Curve for SVC

In [None]:
# ROC curve for the SVC model.
y_true=enc.fit_transform(y_test)

# A ROC curve needs a continuous score per sample; hard 0/1 predictions give a
# single operating point only. decision_function returns the signed distance
# to the separating surface; larger values favour the second class in sorted
# label order, which is the class LabelEncoder maps to 1.
# NOTE: this evaluates the SVC on x_test a second time.
y_score=svm.decision_function(x_test)

fpr, tpr, thresholds = roc_curve(y_true, y_score)
roc_auc = auc(fpr, tpr)

print(roc_auc_score(y_true, y_score))

# Plotting the ROC Curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='b', lw=2, label=f'AUC = {roc_auc:.2f}')
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

> ## LinearSVC

In [None]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import ParameterGrid

# Only combinations that LinearSVC supports are listed:
#   * squared_hinge loss works with l1 or l2 penalty in the primal (dual=False);
#   * hinge loss requires the l2 penalty and the dual formulation (dual=True).
# A single flat grid with dual=False makes every hinge candidate fail to fit.
param_dist = [
    {'penalty': ['l1', 'l2'], 'loss': ['squared_hinge'], 'C': [0.1,1.2], 'dual': [False]},
    {'penalty': ['l2'], 'loss': ['hinge'], 'C': [0.1,1.2], 'dual': [True]},
]

clf = LinearSVC()

# The search space is small, so every candidate is evaluated: n_iter is set to
# the grid size instead of a larger constant that the search would have to clip.
n_candidates = len(ParameterGrid(param_dist))

random_search_svm = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=n_candidates, cv=5, random_state=42)
random_search_svm.fit(x_train, y_train)
y_pred=random_search_svm.predict(x_test)
score=accuracy_score(y_test,y_pred)
print("score after hypertunning the model is",score)
print("\n",classification_report(y_test,y_pred))
print("\n",confusion_matrix(y_test,y_pred))

print(random_search_svm.best_params_)
print(random_search_svm.best_estimator_)

## ROC Curve for LinearSVC

In [None]:
# ROC curve for the tuned LinearSVC.
y_true=enc.fit_transform(y_test)

# A ROC curve needs a continuous score per sample; hard 0/1 predictions give a
# single operating point only. decision_function returns the signed distance
# to the hyperplane; larger values favour the second class in sorted label
# order, which is the class LabelEncoder maps to 1.
y_score=random_search_svm.decision_function(x_test)

fpr, tpr, thresholds = roc_curve(y_true, y_score)
roc_auc = auc(fpr, tpr)

print(roc_auc_score(y_true, y_score))

# Plotting the ROC Curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='b', lw=2, label=f'AUC = {roc_auc:.2f}')
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

# Boosting Algorithm

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with default settings, evaluated on the hold-out split.
bos_clf=AdaBoostClassifier().fit(x_train,y_train)

y_pred=bos_clf.predict(x_test)
score_bos=accuracy_score(y_test,y_pred)

print("With AdaBoostClassifier score is",score_bos)
print(f"\n {classification_report(y_test,y_pred)}")
print(f"\n {confusion_matrix(y_test,y_pred)}")

## ROC Curve for AdaBoostClassifier

In [None]:
# ROC curve for the AdaBoost classifier.
y_true=enc.fit_transform(y_test)

# A ROC curve needs a continuous score per sample; hard 0/1 predictions give a
# single operating point only. Column 1 of predict_proba is the probability of
# the second class in sorted label order, which is the class LabelEncoder maps to 1.
y_score=bos_clf.predict_proba(x_test)[:,1]

fpr, tpr, thresholds = roc_curve(y_true, y_score)
roc_auc = auc(fpr, tpr)

print(roc_auc_score(y_true, y_score))

# Plotting the ROC Curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='b', lw=2, label=f'AUC = {roc_auc:.2f}')
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

# Generating predicted labels

In [None]:
# Predict the test-set sentiment with the tuned logistic regression and write
# the submission file: one 'sentiment' column, with the row index saved as 'id'.
sub=pd.DataFrame({'sentiment':random_search.predict(test)}).rename_axis('id')
sub.to_csv("submission.csv")

# Read the file back to check what was written.
output=pd.read_csv("submission.csv")
output