# Importing Libraries 

In [1]:
import pandas as pd
import re
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import log_loss

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
import warnings

In [158]:
# Load the training data; later cells read its 'Phrase' and 'Sentiment' columns.
df = pd.read_csv('Fixed train.csv')
# NLTK's English stop-word list.
# NOTE(review): none of the visible cells read this list afterwards -- the
# vectorizers are configured with stop_words='english' instead. Confirm whether
# it is still needed.
words = stopwords.words('english')

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# Most common words

In [208]:
# Frequency table of the raw, space-separated tokens across all phrases.
vocab = Counter()

for sentence in df.Phrase:
    # Counter.update counts every token of the phrase in one call, so no inner
    # loop variable is needed (and the module-level `words` stop-word list is
    # therefore not rebound to an arbitrary token).
    vocab.update(sentence.split(' '))

# Show the 100 most frequent tokens (case-sensitive, punctuation included).
vocab.most_common(100)

[('the', 34583),
 (',', 31356),
 ('a', 25066),
 ('of', 24153),
 ('and', 23762),
 ('to', 16686),
 ('.', 13131),
 ("'s", 12630),
 ('in', 10128),
 ('is', 9934),
 ('that', 9071),
 ('it', 7719),
 ('as', 6340),
 ('with', 5711),
 ('for', 5445),
 ('its', 5243),
 ('film', 4953),
 ('an', 4457),
 ('movie', 4353),
 ('this', 3805),
 ('be', 3803),
 ('but', 3724),
 ('on', 3517),
 ('The', 3481),
 ('you', 3440),
 ("n't", 2935),
 ('by', 2874),
 ('more', 2865),
 ('his', 2783),
 ('about', 2692),
 ('from', 2576),
 ('than', 2568),
 ('at', 2566),
 ('--', 2550),
 ('or', 2547),
 ('not', 2533),
 ('one', 2526),
 ('all', 2263),
 ('have', 2260),
 ('are', 2231),
 ('like', 2137),
 ("'", 2124),
 ('has', 2061),
 ('A', 2024),
 ('so', 1898),
 ('story', 1824),
 ('-RRB-', 1820),
 ('out', 1758),
 ('who', 1717),
 ('most', 1638),
 ('into', 1614),
 ('too', 1575),
 ('-LRB-', 1556),
 ('up', 1539),
 ('good', 1488),
 ('their', 1438),
 ('characters', 1411),
 ('...', 1379),
 ('`', 1370),
 ("''", 1360),
 ('``', 1357),
 ('can', 1347)

# Preprocessing

In [264]:
#Preprocessing text
_SHARED_LEMMATIZER = []  # one-element lazy cache filled by _get_lemmatizer()


def _get_lemmatizer():
    """Return a single shared WordNetLemmatizer, creating it on first use."""
    if not _SHARED_LEMMATIZER:
        _SHARED_LEMMATIZER.append(WordNetLemmatizer())
    return _SHARED_LEMMATIZER[0]


def preprocessor(text, lemmatizer=None):
    """Clean one phrase token by token and return the cleaned phrase.

    The text is split on single spaces and each token is, in order:
    stripped of HTML-like tags, stripped of simple emoticons, has "n't"
    rewritten to "not", stripped of digits, stripped of every character that
    is neither a word character nor whitespace, lower-cased, blanked if it is
    exactly the bracket placeholder "rrb"/"lrb", and finally lemmatized first
    as a verb and then as a noun.

    Parameters
    ----------
    text : str
        The raw phrase.
    lemmatizer : object, optional
        Any object exposing ``lemmatize(word, pos=...)``. When omitted, a
        shared ``WordNetLemmatizer`` is used instead of building a new one on
        every call.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces. Tokens that end up empty
        are kept as empty strings, so consecutive spaces can appear.
    """
    if lemmatizer is None:
        lemmatizer = _get_lemmatizer()

    data = []
    for word in text.split(' '):
        # Raw-string patterns: '\)', '\(' and '\d' are invalid escape
        # sequences in ordinary string literals.
        word = re.sub(r'<[^>]*>', '', word)  # HTML-like tags
        word = re.sub(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', '', word)  # emoticons such as :) ;-( =D
        word = re.sub(r"n't", 'not', word)
        word = re.sub(r'\d+', '', word)  # Removing digits
        word = re.sub(r'[^\w\s]', '', word)  # Removing non-word characters (punctuation)
        word = word.lower()
        # Drop the -RRB- / -LRB- bracket placeholders only when they make up
        # the whole token; a substring match would also eat the letters out
        # of ordinary words that merely contain "rrb" or "lrb".
        if word in ('rrb', 'lrb'):
            word = ''
        if word:  # nothing to lemmatize once the token has been emptied
            word = lemmatizer.lemmatize(word, pos='v')
            word = lemmatizer.lemmatize(word, pos='n')
        data.append(word)
    return ' '.join(data)

# One Porter stemmer shared by every tokenizer_porter() call.
porter = PorterStemmer()


def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each piece."""
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

In [153]:
# Stand-alone lemmatizer instance used for the demonstration below.
lemmatizer = WordNetLemmatizer()
# Example: with pos='a' (adjective) the recorded output for 'better' is 'good'.
lemmatizer.lemmatize('better', pos = 'a')

'good'

# CountVectorizer

In [454]:
from sklearn.model_selection import train_test_split

# Inputs are the raw phrase strings; targets are their sentiment labels.
X, y = df['Phrase'], df['Sentiment']

# Keep 20% of the rows aside for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=6,
)

In [442]:
# Bag-of-words features over unigrams and bigrams. Each phrase is first cleaned
# by `preprocessor`, then the built-in English stop words are removed.
# NOTE(review): the recorded output shows a scikit-learn warning about
# inconsistent stop words when this cell is fitted.
count = CountVectorizer(
    preprocessor=preprocessor,
    stop_words='english',
    ngram_range=(1, 2),
)

# Vectorizer followed by a logistic-regression classifier.
clf = Pipeline(steps=[
    ('vect', count),
    ('clf', LogisticRegression(solver='lbfgs', random_state=0, max_iter=700)),
])

clf.fit(X_train, y_train)

  'stop_words.' % sorted(inconsistent))


Pipeline(steps=[('vect',
                 CountVectorizer(ngram_range=(1, 2),
                                 preprocessor=<function preprocessor at 0x000001BF24152798>,
                                 stop_words='english')),
                ('clf', LogisticRegression(max_iter=700, random_state=0))])

In [437]:
# Predict labels for the held-out split and print the share that are correct.
y_hat = clf.predict(X_test)
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
# Accuracy itself is symmetric, so the printed value is unchanged, but the
# documented order keeps this call correct if the metric is ever swapped for
# an asymmetric one (precision, recall, classification_report, ...).
print(accuracy_score(y_test, y_hat))

0.6473151352044085


In [443]:
# Load test.csv and predict a sentiment for each entry of its 'Phrase' column
# with the fitted CountVectorizer pipeline (`clf`).
test_set = pd.read_csv('test.csv')
prediction = clf.predict(test_set['Phrase'])

In [439]:
# Pair every test phrase with the sentiment predicted for it.
submission = pd.DataFrame(
    {
        'Phrase': test_set['Phrase'],
        'Sentiment': prediction,
    }
)

In [453]:
# Write the predictions to disk. index=False keeps pandas from adding the row
# index as an extra, unnamed first column, so the file contains exactly the
# 'Phrase' and 'Sentiment' columns.
submission.to_csv('Submission.csv', index=False)

# TfidfVectorizer

In [304]:
# Same features and targets as in the CountVectorizer section: phrase text in,
# sentiment label out.
X, y = df['Phrase'], df['Sentiment']

# 80/20 split with a fixed seed. The seed here is 5, whereas the
# CountVectorizer section splits with 6, so the two sections are evaluated on
# different hold-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=5,
)

In [346]:
# TF-IDF weighted unigram and bigram features, using the same cleaning function
# and built-in English stop-word list as the CountVectorizer pipeline.
# NOTE(review): the recorded output for this cell shows sublinear_tf=True,
# which is not set here -- the stored output appears to come from a different
# version of the cell.
tfidf = TfidfVectorizer(
    preprocessor=preprocessor,
    stop_words='english',
    ngram_range=(1, 2),
)

# Vectorizer followed by a logistic-regression classifier.
clf_2 = Pipeline(steps=[
    ('vect', tfidf),
    ('clf', LogisticRegression(solver='lbfgs', random_state=1, max_iter=500)),
])

clf_2.fit(X_train, y_train)

  'stop_words.' % sorted(inconsistent))


Pipeline(steps=[('vect',
                 TfidfVectorizer(ngram_range=(1, 2),
                                 preprocessor=<function preprocessor at 0x000001BF24152798>,
                                 stop_words='english', sublinear_tf=True)),
                ('clf', LogisticRegression(max_iter=500, random_state=1))])

In [356]:
# Predict labels for the held-out split with the TF-IDF pipeline and print the
# share that are correct.
y_hat = clf_2.predict(X_test)
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
# Accuracy itself is symmetric, so the printed value is unchanged, but the
# documented order keeps this call correct if the metric is ever swapped for
# an asymmetric one (precision, recall, classification_report, ...).
print(accuracy_score(y_test, y_hat))

0.6249733008671878
