In [45]:
import pandas as pd
import numpy as np
from collections import Counter
from stop_words import get_stop_words
import contractions
import re
from spellchecker import SpellChecker
from nltk.stem.porter import PorterStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import emoji

In [171]:
df = pd.read_csv("DisneylandReviews.csv", usecols=["Review_Text", "Rating"])

In [172]:
df1 = df[:20000]

In [139]:
df1.tail()

Unnamed: 0,Rating,Review_Text
19995,5,I love going to Disneyland. I am on my second ...
19996,5,nothing can be described in words about disney...
19997,5,everything about Disneyland is great!!!!! shor...
19998,5,We have been to Disneyland a few times. Each t...
19999,4,It has been a childhood dream of mine to go to...


In [136]:
df1.head()

Unnamed: 0,Rating,Review_Text
0,4,If you've ever been to Disneyland anywhere you...
1,4,Its been a while since d last time we visit HK...
2,4,Thanks God it wasn t too hot or too humid wh...
3,4,HK Disneyland is a great compact park. Unfortu...
4,4,"the location is not in the city, took around 1..."


In [138]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Rating       20000 non-null  int64 
 1   Review_Text  20000 non-null  object
dtypes: int64(1), object(1)
memory usage: 312.6+ KB


In [140]:
df1.sample(5)

Unnamed: 0,Rating,Review_Text
6213,3,"Nothing like Disneyland USA, if you are an adu..."
8158,4,I was actually not keen to visit for a 3rd tim...
3201,5,"Visited for the third time, this time with cou..."
7963,5,Visited in late May 2013. Perfect time to go a...
1844,4,New lands with new rides has made this a real ...


In [141]:
df1['Rating'].value_counts()

5    11129
4     5319
3     2323
2      766
1      463
Name: Rating, dtype: int64

In [142]:
df1.describe()

Unnamed: 0,Rating
count,20000.0
mean,4.29425
std,0.973455
min,1.0
25%,4.0
50%,5.0
75%,5.0
max,5.0


In [143]:
df1.shape

(20000, 2)

#### Cleaning the data

#### Contraction

In [116]:
df['Review_Text']=df['Review_Text'].apply(lambda x: contractions.fix(x))

In [44]:
df.head()

#### Converting letters to lowercase

In [173]:
df1.loc[:,"Review_Text"] = df1.Review_Text.apply(lambda x : str.lower(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [174]:
df1.head()

Unnamed: 0,Rating,Review_Text
0,4,if you've ever been to disneyland anywhere you...
1,4,its been a while since d last time we visit hk...
2,4,thanks god it wasn t too hot or too humid wh...
3,4,hk disneyland is a great compact park. unfortu...
4,4,"the location is not in the city, took around 1..."


#### Removing punctuations

In [175]:
df1.loc[:,"Review_Text"] = df1.Review_Text.apply(lambda x : " ".join(re.findall('[\w]+',x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [176]:
df1.head()

Unnamed: 0,Rating,Review_Text
0,4,if you ve ever been to disneyland anywhere you...
1,4,its been a while since d last time we visit hk...
2,4,thanks god it wasn t too hot or too humid when...
3,4,hk disneyland is a great compact park unfortun...
4,4,the location is not in the city took around 1 ...


#### Removing Stop Words

In [177]:
stop_words = get_stop_words('en')

def remove_stopWords(s):
    '''For removing stop words
    '''
    s = ' '.join(word for word in s.split() if word not in stop_words)
    return s

df1.loc[:,"Review_Text"] = df1.Review_Text.apply(lambda x: remove_stopWords(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [178]:
df1.head()

Unnamed: 0,Rating,Review_Text
0,4,ve ever disneyland anywhere ll find disneyland...
1,4,since d last time visit hk disneyland yet time...
2,4,thanks god wasn t hot humid visiting park othe...
3,4,hk disneyland great compact park unfortunately...
4,4,location city took around 1 hour kowlon kids l...


#### Removing numbers

In [121]:
df["Review_Text"]=df.Review_Text.replace('\w*\d\w*', '', regex=True)

#### Removing the top 10 frequent words

In [180]:
cnt = Counter()
for txt in df1["Review_Text"].values:
    for word in txt.split():
        cnt[word]+=1
cnt.most_common(10)

[('park', 17943),
 ('disneyland', 16489),
 ('disney', 14575),
 ('rides', 14451),
 ('day', 13209),
 ('time', 11675),
 ('t', 11228),
 ('s', 10913),
 ('get', 8820),
 ('can', 8362)]

In [181]:
frequent_words = set([w for(w,wc) in cnt.most_common(10)])
def remover(txt):
    return " ".join([word for word in str(txt).split() if word not in frequent_words])

df1["Review_Text1"] = df1["Review_Text"].apply(lambda txt: remover(txt))
df1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1["Review_Text1"] = df1["Review_Text"].apply(lambda txt: remover(txt))


Unnamed: 0,Rating,Review_Text,Review_Text1
0,4,ve ever disneyland anywhere ll find disneyland...,ve ever anywhere ll find hong kong similar lay...
1,4,since d last time visit hk disneyland yet time...,since d last visit hk yet stay tomorrowland ak...
2,4,thanks god wasn t hot humid visiting park othe...,thanks god wasn hot humid visiting otherwise b...
3,4,hk disneyland great compact park unfortunately...,hk great compact unfortunately quite bit maint...
4,4,location city took around 1 hour kowlon kids l...,location city took around 1 hour kowlon kids l...


#### Removing rare words

In [187]:
num_rare_words = 10
rare_words = set([w for (w, wc) in cnt.most_common()[:-num_rare_words-1:-1]])

In [191]:
def remove_rare(text):
    return " ".join([word for word in str(text).split() if word not in rare_words])

In [193]:
df1["Review_Text2"] = df1["Review_Text1"].apply(lambda text: remove_rare(text))
df1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1["Review_Text2"] = df1["Review_Text1"].apply(lambda text: remove_rare(text))


Unnamed: 0,Rating,Review_Text,Review_Text1,Review_Text2
0,4,ve ever disneyland anywhere ll find disneyland...,ve ever anywhere ll find hong kong similar lay...,ve ever anywhere ll find hong kong similar lay...
1,4,since d last time visit hk disneyland yet time...,since d last visit hk yet stay tomorrowland ak...,since d last visit hk yet stay tomorrowland ak...
2,4,thanks god wasn t hot humid visiting park othe...,thanks god wasn hot humid visiting otherwise b...,thanks god wasn hot humid visiting otherwise b...
3,4,hk disneyland great compact park unfortunately...,hk great compact unfortunately quite bit maint...,hk great compact unfortunately quite bit maint...
4,4,location city took around 1 hour kowlon kids l...,location city took around 1 hour kowlon kids l...,location city took around 1 hour kowlon kids l...


#### Stemming

In [198]:
stemmer = PorterStemmer()
def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

In [200]:
df1["text_stemmed"] = df1["Review_Text2"].apply(lambda text: stem_words(text))
df1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1["text_stemmed"] = df1["Review_Text2"].apply(lambda text: stem_words(text))


Unnamed: 0,Rating,Review_Text,Review_Text1,Review_Text2,text_stemmed
0,4,ve ever disneyland anywhere ll find disneyland...,ve ever anywhere ll find hong kong similar lay...,ve ever anywhere ll find hong kong similar lay...,ve ever anywher ll find hong kong similar layo...
1,4,since d last time visit hk disneyland yet time...,since d last visit hk yet stay tomorrowland ak...,since d last visit hk yet stay tomorrowland ak...,sinc d last visit hk yet stay tomorrowland aka...
2,4,thanks god wasn t hot humid visiting park othe...,thanks god wasn hot humid visiting otherwise b...,thanks god wasn hot humid visiting otherwise b...,thank god wasn hot humid visit otherwis big is...
3,4,hk disneyland great compact park unfortunately...,hk great compact unfortunately quite bit maint...,hk great compact unfortunately quite bit maint...,hk great compact unfortun quit bit mainten wor...
4,4,location city took around 1 hour kowlon kids l...,location city took around 1 hour kowlon kids l...,location city took around 1 hour kowlon kids l...,locat citi took around 1 hour kowlon kid like ...


#### Lemmatization

In [206]:
lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

In [207]:
import nltk
df1["text_lemmatized"] = df1["text_stemmed"].apply(lambda text: lemmatize_words(text))
df1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1["text_lemmatized"] = df1["text_stemmed"].apply(lambda text: lemmatize_words(text))


Unnamed: 0,Rating,Review_Text,Review_Text1,Review_Text2,text_stemmed,text_lemmatized
0,4,ve ever disneyland anywhere ll find disneyland...,ve ever anywhere ll find hong kong similar lay...,ve ever anywhere ll find hong kong similar lay...,ve ever anywher ll find hong kong similar layo...,ve ever anywher ll find hong kong similar layo...
1,4,since d last time visit hk disneyland yet time...,since d last visit hk yet stay tomorrowland ak...,since d last visit hk yet stay tomorrowland ak...,sinc d last visit hk yet stay tomorrowland aka...,sinc d last visit hk yet stay tomorrowland aka...
2,4,thanks god wasn t hot humid visiting park othe...,thanks god wasn hot humid visiting otherwise b...,thanks god wasn hot humid visiting otherwise b...,thank god wasn hot humid visit otherwis big is...,thank god wasn hot humid visit otherwis big is...
3,4,hk disneyland great compact park unfortunately...,hk great compact unfortunately quite bit maint...,hk great compact unfortunately quite bit maint...,hk great compact unfortun quit bit mainten wor...,hk great compact unfortun quit bit mainten wor...
4,4,location city took around 1 hour kowlon kids l...,location city took around 1 hour kowlon kids l...,location city took around 1 hour kowlon kids l...,locat citi took around 1 hour kowlon kid like ...,locat citi take around 1 hour kowlon kid like ...


#### Spelling Correction

In [221]:
spell = SpellChecker()
def correct_spellings(text):
    corrected_text = []
    misspelled_words = spell.unknown(text.split())
    for word in text.split():
        if word in misspelled_words:
            corrected_text.append(spell.correction(word))
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)

In [222]:
df1["text_corrected"] = df1["text_lemmatized"].apply(lambda text: correct_spellings(text))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1["text_corrected"] = df1["text_lemmatized"].apply(lambda text: correct_spellings(text))


In [223]:
df1.head()

Unnamed: 0,Rating,Review_Text,Review_Text1,Review_Text2,text_stemmed,text_lemmatized,text_corrected
0,4,ve ever disneyland anywhere ll find disneyland...,ve ever anywhere ll find hong kong similar lay...,ve ever anywhere ll find hong kong similar lay...,ve ever anywher ll find hong kong similar layo...,ve ever anywher ll find hong kong similar layo...,ve ever anywher all find hong kong similar lay...
1,4,since d last time visit hk disneyland yet time...,since d last visit hk yet stay tomorrowland ak...,since d last visit hk yet stay tomorrowland ak...,sinc d last visit hk yet stay tomorrowland aka...,sinc d last visit hk yet stay tomorrowland aka...,since i last visit he yet stay tomorrowland ak...
2,4,thanks god wasn t hot humid visiting park othe...,thanks god wasn hot humid visiting otherwise b...,thanks god wasn hot humid visiting otherwise b...,thank god wasn hot humid visit otherwis big is...,thank god wasn hot humid visit otherwis big is...,thank god wasn hot humid visit otherwise big i...
3,4,hk disneyland great compact park unfortunately...,hk great compact unfortunately quite bit maint...,hk great compact unfortunately quite bit maint...,hk great compact unfortun quit bit mainten wor...,hk great compact unfortun quit bit mainten wor...,he great compact unfortune quit bit maintain w...
4,4,location city took around 1 hour kowlon kids l...,location city took around 1 hour kowlon kids l...,location city took around 1 hour kowlon kids l...,locat citi took around 1 hour kowlon kid like ...,locat citi take around 1 hour kowlon kid like ...,local city take around 1 hour kowloon kid like...


#### Remove words with three letters or less

In [225]:
df1["Without3letters"]=df1.text_corrected.str.replace(r'\b(\w{1,3})\b', '')

  df1["Without3letters"]=df1.text_corrected.str.replace(r'\b(\w{1,3})\b', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1["Without3letters"]=df1.text_corrected.str.replace(r'\b(\w{1,3})\b', '')


In [226]:
df1.head()

Unnamed: 0,Rating,Review_Text,Review_Text1,Review_Text2,text_stemmed,text_lemmatized,text_corrected,Without3letters
0,4,ve ever disneyland anywhere ll find disneyland...,ve ever anywhere ll find hong kong similar lay...,ve ever anywhere ll find hong kong similar lay...,ve ever anywher ll find hong kong similar layo...,ve ever anywher ll find hong kong similar layo...,ve ever anywher all find hong kong similar lay...,ever anywher find hong kong similar layout w...
1,4,since d last time visit hk disneyland yet time...,since d last visit hk yet stay tomorrowland ak...,since d last visit hk yet stay tomorrowland ak...,sinc d last visit hk yet stay tomorrowland aka...,sinc d last visit hk yet stay tomorrowland aka...,since i last visit he yet stay tomorrowland ak...,since last visit stay tomorrowland marvel ...
2,4,thanks god wasn t hot humid visiting park othe...,thanks god wasn hot humid visiting otherwise b...,thanks god wasn hot humid visiting otherwise b...,thank god wasn hot humid visit otherwis big is...,thank god wasn hot humid visit otherwis big is...,thank god wasn hot humid visit otherwise big i...,thank wasn humid visit otherwise issue sha...
3,4,hk disneyland great compact park unfortunately...,hk great compact unfortunately quite bit maint...,hk great compact unfortunately quite bit maint...,hk great compact unfortun quit bit mainten wor...,hk great compact unfortun quit bit mainten wor...,he great compact unfortune quit bit maintain w...,great compact unfortune quit maintain work ...
4,4,location city took around 1 hour kowlon kids l...,location city took around 1 hour kowlon kids l...,location city took around 1 hour kowlon kids l...,locat citi took around 1 hour kowlon kid like ...,locat citi take around 1 hour kowlon kid like ...,local city take around 1 hour kowloon kid like...,local city take around hour kowloon like muc...


In [227]:
#saving the data
df1.to_csv("C:/Users/mai_a/Desktop/Project4_NLP/data_cleaned.csv")

In [46]:
clean_df = pd.read_csv("C:/Users/mai_a/Desktop/Project4_NLP/data_cleaned.csv")

#### Checking for null values

In [47]:
clean_df.isnull().sum()

Unnamed: 0         0
Rating             0
Review_Text        0
Review_Text1       0
Review_Text2       0
text_stemmed       0
text_lemmatized    0
text_corrected     0
Without3letters    0
dtype: int64

In [48]:
clean_df.head()

Unnamed: 0.1,Unnamed: 0,Rating,Review_Text,Review_Text1,Review_Text2,text_stemmed,text_lemmatized,text_corrected,Without3letters
0,0,4,ve ever disneyland anywhere ll find disneyland...,ve ever anywhere ll find hong kong similar lay...,ve ever anywhere ll find hong kong similar lay...,ve ever anywher ll find hong kong similar layo...,ve ever anywher ll find hong kong similar layo...,ve ever anywher all find hong kong similar lay...,ever anywher find hong kong similar layout w...
1,1,4,since d last time visit hk disneyland yet time...,since d last visit hk yet stay tomorrowland ak...,since d last visit hk yet stay tomorrowland ak...,sinc d last visit hk yet stay tomorrowland aka...,sinc d last visit hk yet stay tomorrowland aka...,since i last visit he yet stay tomorrowland ak...,since last visit stay tomorrowland marvel ...
2,2,4,thanks god wasn t hot humid visiting park othe...,thanks god wasn hot humid visiting otherwise b...,thanks god wasn hot humid visiting otherwise b...,thank god wasn hot humid visit otherwis big is...,thank god wasn hot humid visit otherwis big is...,thank god wasn hot humid visit otherwise big i...,thank wasn humid visit otherwise issue sha...
3,3,4,hk disneyland great compact park unfortunately...,hk great compact unfortunately quite bit maint...,hk great compact unfortunately quite bit maint...,hk great compact unfortun quit bit mainten wor...,hk great compact unfortun quit bit mainten wor...,he great compact unfortune quit bit maintain w...,great compact unfortune quit maintain work ...
4,4,4,location city took around 1 hour kowlon kids l...,location city took around 1 hour kowlon kids l...,location city took around 1 hour kowlon kids l...,locat citi took around 1 hour kowlon kid like ...,locat citi take around 1 hour kowlon kid like ...,local city take around 1 hour kowloon kid like...,local city take around hour kowloon like muc...


#### Removing Emojis

In [49]:
def split_count(info):
    return len([c for c in info if c in emoji.UNICODE_EMOJI])


clean_df["Emoji_Count"] = clean_df["Without3letters"].apply(split_count)
clean_df.head()

Unnamed: 0.1,Unnamed: 0,Rating,Review_Text,Review_Text1,Review_Text2,text_stemmed,text_lemmatized,text_corrected,Without3letters,Emoji_Count
0,0,4,ve ever disneyland anywhere ll find disneyland...,ve ever anywhere ll find hong kong similar lay...,ve ever anywhere ll find hong kong similar lay...,ve ever anywher ll find hong kong similar layo...,ve ever anywher ll find hong kong similar layo...,ve ever anywher all find hong kong similar lay...,ever anywher find hong kong similar layout w...,0
1,1,4,since d last time visit hk disneyland yet time...,since d last visit hk yet stay tomorrowland ak...,since d last visit hk yet stay tomorrowland ak...,sinc d last visit hk yet stay tomorrowland aka...,sinc d last visit hk yet stay tomorrowland aka...,since i last visit he yet stay tomorrowland ak...,since last visit stay tomorrowland marvel ...,0
2,2,4,thanks god wasn t hot humid visiting park othe...,thanks god wasn hot humid visiting otherwise b...,thanks god wasn hot humid visiting otherwise b...,thank god wasn hot humid visit otherwis big is...,thank god wasn hot humid visit otherwis big is...,thank god wasn hot humid visit otherwise big i...,thank wasn humid visit otherwise issue sha...,0
3,3,4,hk disneyland great compact park unfortunately...,hk great compact unfortunately quite bit maint...,hk great compact unfortunately quite bit maint...,hk great compact unfortun quit bit mainten wor...,hk great compact unfortun quit bit mainten wor...,he great compact unfortune quit bit maintain w...,great compact unfortune quit maintain work ...,0
4,4,4,location city took around 1 hour kowlon kids l...,location city took around 1 hour kowlon kids l...,location city took around 1 hour kowlon kids l...,locat citi took around 1 hour kowlon kid like ...,locat citi take around 1 hour kowlon kid like ...,local city take around 1 hour kowloon kid like...,local city take around hour kowloon like muc...,0


In [50]:
clean_df.Emoji_Count.sum()

0

In [51]:
def remove_emoji(string):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"
                               u"\U0001F300-\U0001F5FF"
                               u"\U0001F680-\U0001F6FF"
                               u"\U0001F1E0-\U0001F1FF"
                               u"U00002707-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

In [52]:
clean_df['WithoutEmojis']=clean_df['Without3letters'].apply(lambda x: remove_emoji(x))

In [63]:
clean_df.to_csv("C:/Users/mai_a/Desktop/Project4_NLP/cleaned_data_final.csv")

## Modeling

In [1]:
import pandas as pd

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

In [10]:
model_data = pd.read_csv("C:/Users/mai_a/Downloads/final_data_20.csv")

In [11]:
model_data.head(1)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Rating,Review_Text,Review_Text1,Review_Text2,text_stemmed,text_lemmatized,text_corrected,Without3letters,Emoji_Count,WithoutEmojis,Removed_words,Reviews_words
0,0,0,0,4,ve ever disneyland anywhere ll find disneyland...,ve ever anywhere ll find hong kong similar lay...,ve ever anywhere ll find hong kong similar lay...,ve ever anywher ll find hong kong similar layo...,ve ever anywher ll find hong kong similar layo...,ve ever anywher all find hong kong similar lay...,ever anywher find hong kong similar layout w...,0,,ever anywher find hong kong similar layout wal...,ever anywher find hong kong similar layout wal...


In [12]:
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1,5), analyzer='char')

In [13]:
X = tfidf.fit_transform(model_data["Reviews_words"])
y = model_data["Rating"]
X

<20000x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 12636819 stored elements in Compressed Sparse Row format>

In [14]:
X.shape, y.shape

((20000, 10000), (20000,))

In [28]:
y.value_counts()

5    11129
4     5319
3     2323
2      766
1      463
Name: Rating, dtype: int64

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [16]:
X_train1, X_val, y_train1, y_val = train_test_split(X_train,y_train,test_size=0.2,random_state=0)

In [17]:
X_train.shape

(16000, 10000)

In [19]:
clf = LinearSVC(C=10, class_weight="balanced")
clf.fit(X_train1, y_train1)

LinearSVC(C=10, class_weight='balanced')

In [20]:
clf.score(X_val,y_val)

0.553125

In [21]:
clf.score(X_test,y_test)

0.5415

In [None]:
## Logistic Regression

In [22]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(C = 1) 
lr.fit(X_train1, y_train1)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(C=1)

In [23]:
lr.score(X_val,y_val)

0.6125

In [24]:
lr.score(X_test,y_test)

0.59575

## Random Forest

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [26]:
rf_clf = RandomForestClassifier(n_estimators = 100,max_features=5, max_depth=7,min_samples_leaf=2, random_state = 0)
rf_clf.fit(X_train1, y_train1)
print('Accuracy on validation set:',rf_clf.score(X_val,y_val))
print('Accuracy on test set:',rf_clf.score(X_test,y_test))

Accuracy on validation set: 0.559375
Accuracy on test set: 0.5545


## KNN

In [27]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 10)
knn.fit(X_train1, y_train1)
print('Accuracy on validation set:',knn.score(X_val,y_val))
print('Accuracy on test set:',knn.score(X_test, y_test))

Accuracy on validation set: 0.5253125
Accuracy on test set: 0.52625


In [94]:
x = "the weather was nice."
#x = get_clean(x)
vec = tfidf.transform()
clf.predict(vec)

array([4], dtype=int64)

In [29]:
import pickle
with open(f"C:/Users/mai_a/Desktop/Project4_NLP/clf_model.pickle", "wb") as pfile:
    pickle.dump(clf, pfile)
    