### Sentiment Analysis

In [2]:
import pandas as pd
import nltk
import os
import string
import re
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from wordcloud import WordCloud
import xgboost as xgb
import gensim
import numpy as np



In [3]:
data_df=pd.DataFrame()
for dirname,_,filenames in os.walk("C:\\Users\\maria\\Desktop\\sentiment labelled sentences"):
    for filename in filenames:
        if(filename.endswith(".txt")):
            data_df = data_df.append(pd.read_csv(os.path.join(dirname,filename),delimiter='\t',header=None,names=["Review","Sentiment"]))
        

In [4]:
len(data_df)

2769

In [5]:
data_df.reset_index(drop="True",inplace=True)

In [6]:
data_df['Review']=data_df['Review'].str.lower()

In [7]:
data_df.head()

Unnamed: 0,Review,Sentiment
0,so there is no way for me to plug it in here i...,0.0
1,"good case, excellent value.",1.0
2,great for the jawbone.,1.0
3,tied to charger for conversations lasting more...,0.0
4,the mic is great.,1.0


### Eliminating Punctuations

In [8]:
data_df['Review']=data_df['Review'].str.replace("[^a-zA-z]"," ")

### Removing short words

In [9]:
data_df['Review']=data_df['Review'].apply(lambda x:' '.join([words for words in x.split() if len(words)>=3]))

In [10]:
data_df.head()

Unnamed: 0,Review,Sentiment
0,there way for plug here the unless converter,0.0
1,good case excellent value,1.0
2,great for the jawbone,1.0
3,tied charger for conversations lasting more th...,0.0
4,the mic great,1.0


### Stopword Removal

In [11]:
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maria\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
data_df['Review']=data_df['Review'].apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords.words('english')]))
data_df.head()

Unnamed: 0,Review,Sentiment
0,way plug unless converter,0.0
1,good case excellent value,1.0
2,great jawbone,1.0
3,tied charger conversations lasting minutes maj...,0.0
4,mic great,1.0


### Stemming and Lemmatization

In [13]:
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maria\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\maria\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
lemma=WordNetLemmatizer()
stemmer=SnowballStemmer('english')

data_df['Review']=data_df['Review'].apply(lambda x: ' '.join([stemmer.stem(lemma.lemmatize(word)) for word in x.split()]))
data_df.head()


Unnamed: 0,Review,Sentiment
0,way plug unless convert,0.0
1,good case excel valu,1.0
2,great jawbon,1.0
3,tie charger convers last minut major problem,0.0
4,mic great,1.0


### Word Cloud

In [15]:
all_words = ' '.join([words for words in data_df['Review']])
len(all_words)

109112

In [16]:
positive_words = ' '.join([words for words in data_df[data_df['Sentiment']==1]['Review']])
len(positive_words)

54333

In [17]:
negative_words = ' '.join([words for words in data_df[data_df['Sentiment']==0]['Review']])
len(negative_words)

54128

In [18]:

wordcloud = WordCloud(width=1000,height=500,random_state=21, max_font_size=110).generate(all_words)
plt.figure(figsize=(10,7))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

NameError: name 'plt' is not defined

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt  
wordcloud = WordCloud(width=1000, height=500, random_state=21, max_font_size=110).generate(positive_words) 
plt.figure(figsize=(10, 7)) 
plt.imshow(wordcloud) 
plt.axis('off')
plt.show()


In [None]:
wordcloud=WordCloud(width=1000, height=500, random_state=21, max_font_size=110).generate(negative_words)
plt.figure(figsize=(10,7))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

### Eliminate rare and frequently occuring words 

### Word2Vec

In [19]:
from gensim import models
tokenized = data_df['Review'].apply(lambda x: x.split())
model_w2v = models.Word2Vec(tokenized,min_count=2,vector_size=500,alpha=0.03, 
                     min_alpha=0.0007)
model_w2v.build_vocab(tokenized,progress_per=10000)
model_w2v.train(tokenized, total_examples=model_w2v.corpus_count, epochs=30, report_delay=1)

(424433, 537600)

In [20]:
model_w2v.wv.most_similar(positive=['terribl'],topn=25)

[('slow', 0.99055415391922),
 ('littl', 0.9891200065612793),
 ('mediocr', 0.9882552027702332),
 ('serv', 0.9859213829040527),
 ('attitud', 0.9858940243721008),
 ('fantast', 0.9858649969100952),
 ('non', 0.9854453802108765),
 ('prepar', 0.9854159951210022),
 ('apolog', 0.9853435158729553),
 ('basic', 0.9850645065307617),
 ('toward', 0.9847434759140015),
 ('extrem', 0.9846440553665161),
 ('talk', 0.9846248030662537),
 ('charisma', 0.9842594265937805),
 ('atroci', 0.9842436909675598),
 ('focus', 0.9838920831680298),
 ('pace', 0.9837925434112549),
 ('par', 0.9837725162506104),
 ('pay', 0.9837449193000793),
 ('pass', 0.9837180376052856),
 ('sprint', 0.983699381351471),
 ('meh', 0.9836912155151367),
 ('aw', 0.9836273789405823),
 ('averag', 0.9834861159324646),
 ('delici', 0.9834702014923096)]

In [21]:
model_w2v.wv.similarity('horribl','terribl')

0.9700787

In [22]:
model_w2v.wv.doesnt_match(['good','horribl','terribl'])

'good'

In [23]:
model_w2v.wv['good']

array([ 1.74120843e-01,  1.74319312e-01,  1.43234730e-01,  1.59020826e-01,
       -2.49292124e-02, -1.42647564e-01, -8.55142530e-03,  1.99307442e-01,
        4.58303839e-02,  4.19029444e-02, -3.36616822e-02,  7.93639719e-02,
        2.72114333e-02,  3.66062298e-02,  4.22327220e-02, -1.40021607e-01,
       -1.70206085e-01, -8.25625472e-03,  1.27335787e-02,  3.91234607e-02,
        1.14053033e-01, -5.68280555e-02,  1.16813660e-01, -3.92387956e-02,
        9.80603471e-02,  5.48830293e-02,  7.10215196e-02,  4.38689766e-03,
       -2.44485706e-01, -1.77769677e-03,  4.31259274e-02, -3.47218290e-02,
       -9.08817425e-02, -4.77706976e-02,  1.72455460e-01,  9.39952806e-02,
        2.70367004e-02, -1.35579303e-01, -4.81237993e-02, -1.45756096e-01,
        1.11813666e-02, -5.94368950e-02, -1.68569475e-01,  1.01409614e-01,
       -9.07056704e-02, -9.24823284e-02, -8.00895095e-02,  4.78515700e-02,
       -4.14728634e-02, -1.59983914e-02, -2.03076396e-02, -1.61702223e-02,
       -3.88611183e-02, -

##### Feature set for word2vec

In [24]:
def word_vector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0
    for word in tokens:
        try:
            vec += model_w2v.wv[word].reshape((1, size))
            count += 1.
        except KeyError:  # handling the case where the token is not in vocabulary
            continue
    if count != 0:
        vec /= count
    return vec

In [25]:
wordvec_arrays = np.zeros((len(tokenized), 500)) 
for i in range(len(tokenized)):
    wordvec_arrays[i,:] = word_vector(tokenized[i], 500)
wordvec_df = pd.DataFrame(wordvec_arrays)
wordvec_df.shape

(2769, 500)

In [26]:
wordvec_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,0.116164,0.15014,0.157637,0.167075,-0.038732,-0.195063,-0.037403,0.224993,0.061536,0.028788,...,0.004375,-0.051435,0.164655,-0.029094,0.072818,-0.0379,-0.047797,-0.140069,-0.112368,-0.064705
1,0.161522,0.152545,0.123183,0.146035,-0.009645,-0.130634,0.002202,0.188009,0.064717,0.047527,...,0.042883,-0.06683,0.183888,-0.068949,0.019747,-0.037597,-0.015067,-0.122631,-0.028011,-0.070028
2,0.15195,0.117054,0.097127,0.124619,-0.027188,-0.105692,0.009686,0.152977,0.043762,0.030666,...,0.019984,-0.054562,0.149631,-0.061866,0.016582,-0.045131,-0.006541,-0.095804,-0.03416,-0.059227
3,0.097068,0.135354,0.15129,0.133517,-0.018089,-0.183336,-0.046367,0.208999,0.050225,0.008495,...,-0.007296,-0.035852,0.121306,-0.010089,0.085524,-0.008289,-0.043293,-0.127748,-0.127939,-0.074941
4,0.149772,0.109539,0.090114,0.118735,-0.021739,-0.095942,0.010114,0.143584,0.040449,0.030532,...,0.024626,-0.055909,0.148624,-0.060887,0.010106,-0.044135,-0.004543,-0.088525,-0.022525,-0.055222


### Doc2Vec

### Count Vectorizer

In [27]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=1,max_df=0.95,ngram_range=(1,3))
x=vectorizer.fit_transform(data_df['Review'])

### Vectorization (Tf-Idf)

In [228]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = TfidfVectorizer(min_df=1,max_df=0.95,ngram_range=(1,3))
x=vectorizer.fit_transform(data_df['Review'])


In [28]:
x=pd.DataFrame(x.toarray(),columns=list(vectorizer.get_feature_names()))

In [29]:
y=data_df['Sentiment']
y = np.nan_to_num(y)

In [30]:
X_train, X_test, y_train, y_test = train_test_split(x,y,train_size = 0.80)

In [31]:
X_train

Unnamed: 0,aailiyah,aailiyah pretti,aailiyah pretti good,abandon,abandon factori,abandon factori readi,abhor,abil,abil actual,abil actual know,...,zillion time,zillion time away,zombi,zombi movi,zombi movi avoid,zombi student,zombi student back,zombiez,zombiez part,zombiez part hellish
823,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
621,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1172,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1646,0,0,0,0,0,0,0,1,0,0,...,0,0,1,1,1,0,0,0,0,0
2733,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2284,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
799,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2382,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
468,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report 

### Multinomial Naive Bayes

In [34]:
param_grid = { 'alpha':[0.2,0.4,0.6,0.8,1] }
grid = GridSearchCV(MultinomialNB(),param_grid,refit=True,verbose=2)
grid.fit(X_train,y_train)
grid.best_params_

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] alpha=0.2 .......................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ........................................ alpha=0.2, total=   1.8s
[CV] alpha=0.2 .......................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.8s remaining:    0.0s


[CV] ........................................ alpha=0.2, total=   2.0s
[CV] alpha=0.2 .......................................................
[CV] ........................................ alpha=0.2, total=   2.0s
[CV] alpha=0.2 .......................................................
[CV] ........................................ alpha=0.2, total=   1.8s
[CV] alpha=0.2 .......................................................
[CV] ........................................ alpha=0.2, total=   1.8s
[CV] alpha=0.4 .......................................................
[CV] ........................................ alpha=0.4, total=   1.8s
[CV] alpha=0.4 .......................................................
[CV] ........................................ alpha=0.4, total=   1.8s
[CV] alpha=0.4 .......................................................
[CV] ........................................ alpha=0.4, total=   1.6s
[CV] alpha=0.4 .......................................................
[CV] .

[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:  1.1min finished


{'alpha': 1}

### Logistic Regression

In [35]:
param_grid = { 'penalty':['l1','l2','elasticnet'], 'C':[0.1,1,10,100]}
grid = GridSearchCV(LogisticRegression(),param_grid,refit=True,verbose=2)
grid.fit(X_train,y_train)
grid.best_params_

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] C=0.1, penalty=l1 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Traceback (most recent call last):
  File "C:\Users\maria\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\maria\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\maria\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 442, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV] ................................ C=0.1, penalty=l1, total=   0.2s
[CV] C=0.1, penalty=l1 ...............................................
[CV] ................................ C=0.1, penalty=l1, total=   0.2s
[CV] C=0.1, penalty=l1 ...............................................
[CV] ................................ C=0.1, penalty=l1, total=   0.2s
[CV] C=0.1, penalty=l1 ...............................................
[CV] ................................ C=0.1, penalty=l1, total=   0.1s
[CV] C=0.1, penalty=l1 ...............................................
[CV] ................................ C=0.1, penalty=l1, total=   0.1s
[CV] C=0.1, penalty=l2 ...............................................
[CV] ................................ C=0.1, penalty=l2, total=   2.0s
[CV] C=0.1, penalty=l2 ...............................................
[CV] ................................ C=0.1, penalty=l2, total=   2.3s
[CV] C=0.1, penalty=l2 ...............................................
[CV] .

Traceback (most recent call last):
  File "C:\Users\maria\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\maria\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\maria\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 442, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.



[CV] ........................ C=0.1, penalty=elasticnet, total=   0.2s
[CV] C=0.1, penalty=elasticnet .......................................
[CV] ........................ C=0.1, penalty=elasticnet, total=   0.2s
[CV] C=0.1, penalty=elasticnet .......................................
[CV] ........................ C=0.1, penalty=elasticnet, total=   0.1s
[CV] C=0.1, penalty=elasticnet .......................................
[CV] ........................ C=0.1, penalty=elasticnet, total=   0.2s
[CV] C=0.1, penalty=elasticnet .......................................
[CV] ........................ C=0.1, penalty=elasticnet, total=   0.2s
[CV] C=1, penalty=l1 .................................................
[CV] .................................. C=1, penalty=l1, total=   0.2s
[CV] C=1, penalty=l1 .................................................
[CV] .................................. C=1, penalty=l1, total=   0.2s

Traceback (most recent call last):
  File "C:\Users\maria\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\maria\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\maria\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 442, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.




[CV] C=1, penalty=l1 .................................................
[CV] .................................. C=1, penalty=l1, total=   0.2s
[CV] C=1, penalty=l1 .................................................
[CV] .................................. C=1, penalty=l1, total=   0.1s
[CV] C=1, penalty=l1 .................................................
[CV] .................................. C=1, penalty=l1, total=   0.2s
[CV] C=1, penalty=l2 .................................................
[CV] .................................. C=1, penalty=l2, total=   3.0s
[CV] C=1, penalty=l2 .................................................
[CV] .................................. C=1, penalty=l2, total=   3.5s
[CV] C=1, penalty=l2 .................................................
[CV] .................................. C=1, penalty=l2, total=   2.9s
[CV] C=1, penalty=l2 .................................................
[CV] .................................. C=1, penalty=l2, total=   3.0s
[CV] 

Traceback (most recent call last):
  File "C:\Users\maria\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\maria\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\maria\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 442, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.



[CV] .......................... C=1, penalty=elasticnet, total=   0.2s
[CV] C=1, penalty=elasticnet .........................................
[CV] .......................... C=1, penalty=elasticnet, total=   0.1s
[CV] C=1, penalty=elasticnet .........................................
[CV] .......................... C=1, penalty=elasticnet, total=   0.2s
[CV] C=1, penalty=elasticnet .........................................
[CV] .......................... C=1, penalty=elasticnet, total=   0.1s
[CV] C=1, penalty=elasticnet .........................................
[CV] .......................... C=1, penalty=elasticnet, total=   0.1s
[CV] C=10, penalty=l1 ................................................
[CV] ................................. C=10, penalty=l1, total=   0.1s


Traceback (most recent call last):
  File "C:\Users\maria\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\maria\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\maria\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 442, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] C=10, penalty=l1 ................................................
[CV] ................................. C=10, penalty=l1, total=   0.1s
[CV] C=10, penalty=l1 ................................................
[CV] ................................. C=10, penalty=l1, total=   0.2s
[CV] C=10, penalty=l1 ................................................
[CV] ................................. C=10, penalty=l1, total=   0.1s
[CV] C=10, penalty=l1 ................................................
[CV] ................................. C=10, penalty=l1, total=   0.1s
[CV] C=10, penalty=l2 ................................................
[CV] ................................. C=10, penalty=l2, total=   4.3s
[CV] C=10, penalty=l2 ................................................
[CV] ................................. C=10, penalty=l2, total=   4.3s
[CV] C=10, penalty=l2 ................................................
[CV] ................................. C=10, penalty=l2, total=   3.9s
[CV] C

Traceback (most recent call last):
  File "C:\Users\maria\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\maria\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\maria\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 442, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.



[CV] C=10, penalty=elasticnet ........................................
[CV] ......................... C=10, penalty=elasticnet, total=   0.2s
[CV] C=10, penalty=elasticnet ........................................
[CV] ......................... C=10, penalty=elasticnet, total=   0.2s
[CV] C=10, penalty=elasticnet ........................................
[CV] ......................... C=10, penalty=elasticnet, total=   0.1s
[CV] C=10, penalty=elasticnet ........................................
[CV] ......................... C=10, penalty=elasticnet, total=   0.2s
[CV] C=100, penalty=l1 ...............................................


Traceback (most recent call last):
  File "C:\Users\maria\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\maria\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\maria\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 442, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] ................................ C=100, penalty=l1, total=   0.1s
[CV] C=100, penalty=l1 ...............................................
[CV] ................................ C=100, penalty=l1, total=   0.1s
[CV] C=100, penalty=l1 ...............................................
[CV] ................................ C=100, penalty=l1, total=   0.2s
[CV] C=100, penalty=l1 ...............................................
[CV] ................................ C=100, penalty=l1, total=   0.2s
[CV] C=100, penalty=l1 ...............................................
[CV] ................................ C=100, penalty=l1, total=   0.1s
[CV] C=100, penalty=l2 ...............................................
[CV] ................................ C=100, penalty=l2, total=   5.3s
[CV] C=100, penalty=l2 ...............................................
[CV] ................................ C=100, penalty=l2, total=   4.8s
[CV] C=100, penalty=l2 ...............................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] ................................ C=100, penalty=l2, total=   5.3s
[CV] C=100, penalty=l2 ...............................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] ................................ C=100, penalty=l2, total=   5.6s
[CV] C=100, penalty=l2 ...............................................
[CV] ................................ C=100, penalty=l2, total=   5.2s
[CV] C=100, penalty=elasticnet .......................................


Traceback (most recent call last):
  File "C:\Users\maria\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\maria\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\maria\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 442, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.



[CV] ........................ C=100, penalty=elasticnet, total=   0.2s
[CV] C=100, penalty=elasticnet .......................................
[CV] ........................ C=100, penalty=elasticnet, total=   0.2s
[CV] C=100, penalty=elasticnet .......................................
[CV] ........................ C=100, penalty=elasticnet, total=   0.1s
[CV] C=100, penalty=elasticnet .......................................
[CV] ........................ C=100, penalty=elasticnet, total=   0.2s
[CV] C=100, penalty=elasticnet .......................................
[CV] ........................ C=100, penalty=elasticnet, total=   0.2s


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:  1.3min finished


{'C': 1, 'penalty': 'l2'}

### Decision Tree Classifier

### Random Forest Classifier

In [37]:
param_grid = {'n_estimators': [200,500], 'max_features':['auto','sqrt','log2'] ,'criterion':['gini','entropy'] }
grid = GridSearchCV(RandomForestClassifier(),param_grid,refit=True,verbose=2)
grid.fit(X_train,y_train)
grid.best_params_

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] criterion=gini, max_features=auto, n_estimators=200 .............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  criterion=gini, max_features=auto, n_estimators=200, total= 1.8min
[CV] criterion=gini, max_features=auto, n_estimators=200 .............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.8min remaining:    0.0s


[CV]  criterion=gini, max_features=auto, n_estimators=200, total= 1.9min
[CV] criterion=gini, max_features=auto, n_estimators=200 .............
[CV]  criterion=gini, max_features=auto, n_estimators=200, total= 1.9min
[CV] criterion=gini, max_features=auto, n_estimators=200 .............
[CV]  criterion=gini, max_features=auto, n_estimators=200, total= 1.8min
[CV] criterion=gini, max_features=auto, n_estimators=200 .............
[CV]  criterion=gini, max_features=auto, n_estimators=200, total= 1.9min
[CV] criterion=gini, max_features=auto, n_estimators=500 .............
[CV]  criterion=gini, max_features=auto, n_estimators=500, total= 4.6min
[CV] criterion=gini, max_features=auto, n_estimators=500 .............
[CV]  criterion=gini, max_features=auto, n_estimators=500, total= 4.6min
[CV] criterion=gini, max_features=auto, n_estimators=500 .............
[CV]  criterion=gini, max_features=auto, n_estimators=500, total= 4.5min
[CV] criterion=gini, max_features=auto, n_estimators=500 ......

[CV]  criterion=entropy, max_features=log2, n_estimators=500, total= 3.1min
[CV] criterion=entropy, max_features=log2, n_estimators=500 ..........
[CV]  criterion=entropy, max_features=log2, n_estimators=500, total= 3.0min


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed: 301.3min finished


{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 500}

### XGBoost

### SVM

In [None]:

param_grid = {'C': [0.1,1, 10], 'gamma': [1,0.1],'kernel': ['rbf', 'poly', 'sigmoid']}
grid = GridSearchCV(svm.SVC(),param_grid,refit=True,verbose=2)
grid.fit(X_train,y_train)
grid.best_params_

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ....................... C=0.1, gamma=1, kernel=rbf, total= 2.4min
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.4min remaining:    0.0s


[CV] ....................... C=0.1, gamma=1, kernel=rbf, total= 2.7min
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] ....................... C=0.1, gamma=1, kernel=rbf, total= 2.7min
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] ....................... C=0.1, gamma=1, kernel=rbf, total= 2.8min
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] ....................... C=0.1, gamma=1, kernel=rbf, total= 2.7min
[CV] C=0.1, gamma=1, kernel=poly .....................................
[CV] ...................... C=0.1, gamma=1, kernel=poly, total= 2.6min
[CV] C=0.1, gamma=1, kernel=poly .....................................
[CV] ...................... C=0.1, gamma=1, kernel=poly, total= 2.0min
[CV] C=0.1, gamma=1, kernel=poly .....................................
[CV] ...................... C=0.1, gamma=1, kernel=poly, total= 1.8min
[CV] C=0.1, gamma=1, kernel=poly .....................................
[CV] .

In [43]:
grid

GridSearchCV(estimator=LogisticRegression(),
             param_grid={'crtiterion': ['gini', 'entropy'],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [200, 500]},
             verbose=2)

In [35]:

model = svm.SVC(kernel='sigmoid')
model.fit(X_train, y_train)
pred = model.predict(X_test)
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

         0.0       0.76      0.85      0.80       284
         1.0       0.82      0.72      0.76       270

    accuracy                           0.78       554
   macro avg       0.79      0.78      0.78       554
weighted avg       0.79      0.78      0.78       554



In [32]:

import warnings
warnings.simplefilter(action='ignore',category=FutureWarning)
\

models=[MultinomialNB(), LogisticRegression(), DecisionTreeClassifier(), RandomForestClassifier(),AdaBoostClassifier()]

for model in models:
    score=cross_val_score(model,X_train,y_train,cv=5)
    model.fit(X_train,y_train.ravel())
    pred=model.predict(X_test)
    print(classification_report(y_test,pred))

              precision    recall  f1-score   support

         0.0       0.82      0.77      0.79       284
         1.0       0.77      0.83      0.80       270

    accuracy                           0.80       554
   macro avg       0.80      0.80      0.80       554
weighted avg       0.80      0.80      0.80       554

              precision    recall  f1-score   support

         0.0       0.78      0.84      0.81       284
         1.0       0.82      0.76      0.79       270

    accuracy                           0.80       554
   macro avg       0.80      0.80      0.80       554
weighted avg       0.80      0.80      0.80       554

              precision    recall  f1-score   support

         0.0       0.73      0.77      0.75       284
         1.0       0.74      0.71      0.72       270

    accuracy                           0.74       554
   macro avg       0.74      0.74      0.74       554
weighted avg       0.74      0.74      0.74       554

              preci