In [77]:
#importing python modules
import pandas as pd
import re
import numpy as np
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [78]:
# Read the train and test CSV files; both paths are relative to the
# notebook's working directory (nothing is downloaded here).
train_dataset, test_dataset = (
    pd.read_csv(csv_path)
    for csv_path in ('train_dataset.csv', 'test_dataset.csv')
)

In [79]:
#data cleaning by removing stopwords, punctuation and url link
# Step 1: lowercase every whitespace-separated token (this also collapses
# runs of whitespace into single spaces).
train_dataset['tweet'] = train_dataset['tweet'].apply(
    lambda x: " ".join(word.lower() for word in x.split())
)
# Step 2: drop tokens that appear in the NLTK English stopword list.
train_dataset['tweet'] = train_dataset['tweet'].apply(
    lambda x: " ".join(word for word in x.split() if word not in stop_words)
)
# Steps 3-4: strip URLs, then every character that is neither a word
# character nor whitespace. `regex=True` must be explicit: pandas >= 2.0
# treats the pattern as a literal string by default, which would silently
# leave URLs and punctuation in place. Raw strings avoid invalid-escape
# warnings for `\S`, `\w` and `\s`.
train_dataset['tweet'] = train_dataset['tweet'].str.replace(r'http\S+', '', regex=True)
train_dataset['tweet'] = train_dataset['tweet'].str.replace(r'[^\w\s]', '', regex=True)
train_dataset['tweet'].head()

0    fingerprint pregnancy test  android apps beaut...
1    finally transparant silicon case  thanks uncle...
2    love this would go talk makememories unplug re...
3    im wired know im george made way  iphone cute ...
4    amazing service apple even talk question unles...
Name: tweet, dtype: object

In [80]:
#word tokenization and stemming
# Split each cleaned tweet into tokens.
tokenized_word = train_dataset['tweet'].apply(word_tokenize)
stemmer = PorterStemmer()
# Stem every token and join the stems back into one space-separated string.
# Doing the join inside `.apply` avoids the element-by-element
# `stemmed_word[i] = ...` loop, which assigns by index *label* and therefore
# only works when the Series has a default 0..n-1 index.
stemmed_word = tokenized_word.apply(
    lambda tokens: ' '.join(stemmer.stem(token) for token in tokens)
)

train_dataset['tweet'] = stemmed_word
train_dataset.head()

Unnamed: 0,id,label,tweet
0,1,0,fingerprint pregnanc test android app beauti c...
1,2,0,final transpar silicon case thank uncl yay son...
2,3,0,love thi would go talk makememori unplug relax...
3,4,0,im wire know im georg made way iphon cute dave...
4,5,1,amaz servic appl even talk question unless pay...


In [81]:
# Bag-of-Words feature extraction.
# The vectorizer ignores terms present in more than 90% of the tweets or in
# fewer than 2 tweets, removes scikit-learn's built-in English stop words,
# and caps the vocabulary at 1000 terms.
bow_vectorizar = CountVectorizer(
    stop_words='english',
    max_features=1000,
    min_df=2,
    max_df=0.90,
)
# Learn the vocabulary from the training tweets and build their sparse
# document-term count matrix in one pass.
feature_matrix = bow_vectorizar.fit_transform(train_dataset['tweet'])
# Show the non-zero (row, column) counts of the second tweet as a sanity check.
print(feature_matrix[1])

  (0, 987)	1
  (0, 793)	1
  (0, 989)	1
  (0, 871)	1
  (0, 156)	1
  (0, 305)	1


In [82]:
#creating the logistic regression model
# Hold out 30% of the training tweets for validation.
# The whole feature matrix is passed instead of a hard-coded `[:7920, :]`
# slice: the slice was a no-op when the training set has exactly 7920 rows
# and made `train_test_split` fail with an inconsistent-length error for any
# other size. A fixed `random_state` makes the split (and hence the reported
# validation score) reproducible across runs.
x_train, x_val, y_train, y_val = train_test_split(
    feature_matrix,
    train_dataset['label'],
    test_size=0.3,
    random_state=42,
)

model = LogisticRegression(verbose=1)
model.fit(x_train, y_train)

[LibLinear]

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=1, warm_start=False)

In [83]:
# Class probabilities for the validation split; column 1 corresponds to
# `model.classes_[1]` (label 1 for the 0/1 labels shown above).
prediction = model.predict_proba(x_val)
# Predict 1 whenever that probability reaches 0.3 rather than the default
# 0.5 cut-off. The builtin `int` replaces `np.int`, an alias that was
# deprecated in NumPy 1.20 and removed in 1.24 (AttributeError).
prediction_int = (prediction[:, 1] >= 0.3).astype(int)

f1_score(y_val, prediction_int)

0.787436084733382

In [92]:
#calculating prediction on test dataset and saving it to the file
# Preprocess the test tweets with the same sequence of steps used for the
# training tweets: lowercase, drop NLTK stopwords, strip URLs, strip
# punctuation. `regex=True` is explicit because pandas >= 2.0 matches the
# pattern literally by default.
test_dataset['tweet'] = test_dataset['tweet'].apply(
    lambda x: " ".join(word.lower() for word in x.split())
)
test_dataset['tweet'] = test_dataset['tweet'].apply(
    lambda x: " ".join(word for word in x.split() if word not in stop_words)
)
test_dataset['tweet'] = test_dataset['tweet'].str.replace(r'http\S+', '', regex=True)
test_dataset['tweet'] = test_dataset['tweet'].str.replace(r'[^\w\s]', '', regex=True)

# Tokenize, stem, and re-join each tweet into a single string. The join is
# done inside `.apply` so it does not depend on a 0..n-1 index.
tokenized_word = test_dataset['tweet'].apply(word_tokenize)
stemmer = PorterStemmer()
stemmed_word = tokenized_word.apply(
    lambda tokens: ' '.join(stemmer.stem(token) for token in tokens)
)
test_dataset['tweet'] = stemmed_word

# Vectorize with the vectorizer that was already fitted on the training
# tweets. Fitting a fresh CountVectorizer on the test set would learn a
# different vocabulary, so column j would stand for a different word (or
# the column count would differ) than what the model was trained on, making
# the predictions meaningless. A separate variable name keeps the training
# `feature_matrix` intact.
test_feature_matrix = bow_vectorizar.transform(test_dataset['tweet'])
print(test_feature_matrix[1])

# Same 0.3 probability threshold as used on the validation split; builtin
# `int` replaces the removed `np.int` alias.
test_pred = model.predict_proba(test_feature_matrix)
test_pred_int = (test_pred[:, 1] >= 0.3).astype(int)
test_dataset['label'] = test_pred_int
# Write only the id and predicted label columns to the submission file.
submission = test_dataset[['id', 'label']]
submission.to_csv('sentiment_bow.csv', index=False)