In [1]:
import pandas as pd
import nltk
import string
import re
import numpy as np
from tqdm import tqdm
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import CountVectorizer
from math import log
from sklearn.metrics import *
from nltk.corpus import stopwords

# Getting the train set and preprocessing it

In [2]:
# Lambda is a small function 
data_train = pd.read_csv('../input/task1a3/sentiment_train.csv')
data_test = pd.read_csv('../input/task1a3/sentiment_test.csv')
# Change this line to moderate the train size
tweets_train=data_train['5'][0:50000]
# Lower Case
tweets_train=tweets_train.apply(lambda s:s.lower())

In [3]:
# splitting by space to split the tweets and nomalize username
for i in tqdm(range(len(tweets_train))):
    temp=tweets_train[i].split(" ")
    for j in range(len(temp)):
        if(re.match("^@[a-z_0-9]+$",temp[j])):
            temp[j]="@someusername"
    tweets_train[i]=temp
tweets_train.head()

100%|██████████| 50000/50000 [00:02<00:00, 21713.27it/s]


0                            [another, fire,, yayy., ]
1     [@someusername, , , be, careful, lil, lo, b, , ]
2    [@someusername, oh, yes, it, was, good., i, ho...
3    [dont, get, too, close, or, you, will, get, hu...
4              [@someusername, any, big, e3, plans?, ]
Name: 5, dtype: object

In [4]:
# Nomalize urls in the splitted tweets
for i in tqdm(range(len(tweets_train))):
    temp=tweets_train[i]
    for j in range(len(temp)):
        if(re.match("^http://",temp[j]) or re.match("^https://",temp[j])):
            temp[j]="http://someurl"
    tweets_train[i]=temp

100%|██████████| 50000/50000 [00:03<00:00, 14948.71it/s]


In [5]:
# Combining the tweets together
for i in tqdm(range(len(tweets_train))):
    sent=""
    for word in tweets_train[i]:
        sent+=word+" "
    tweets_train[i]=sent

100%|██████████| 50000/50000 [00:01<00:00, 45019.84it/s]


# Getting the test set and preprocessing it

In [7]:
tweets_test=(data_test['5'])
tweets_test=tweets_test.apply(lambda s:s.lower())


100%|██████████| 480000/480000 [00:24<00:00, 19735.26it/s]


In [None]:
# splitting by space to split the tweets and nomalize username
for i in tqdm(range(len(tweets_test))):
    temp=tweets_test[i].split(" ")
    for j in range(len(temp)):
        if(re.match("^@[a-z_0-9]+$",temp[j])):
            temp[j]="@someusername"
    tweets_test[i]=temp

In [8]:
# splitting by space to split the tweets and nomalize username
for i in tqdm(range(len(tweets_test))):
    temp=tweets_test[i]
    for j in range(len(temp)):
        if(re.match("^http://",temp[j]) or re.match("^https://",temp[j])):
            temp[j]="http://someurl"
    tweets_test[i]=temp

100%|██████████| 480000/480000 [00:33<00:00, 14152.75it/s]


In [9]:
# Combining the tweets together
for i in tqdm(range(len(tweets_test))):
    sent=""
    for word in tweets_test[i]:
        sent+=word+" "
    tweets_test[i]=sent

100%|██████████| 480000/480000 [00:10<00:00, 44653.04it/s]


## Constructing the stop Words List

In [11]:
lemmatizer = WordNetLemmatizer()   
OriginalStopWords=stopwords.words('english')
StopWordSet=set()
# Lemmatizing th Stopwords
for i in OriginalStopWords:
    StopWordSet.add(lemmatizer.lemmatize(i))

## Constructing the unigrams vocab from train set and test set

In [13]:

Vocab={}
tokenizer = nltk.RegexpTokenizer(r"\w+")
for i in tqdm(range(len(tweets_train))):
#     lower case and remove punctuation
    tweet=tokenizer.tokenize(tweets_train[i].lower())
    for word in tweet:
        OriginWord=lemmatizer.lemmatize(word)
        if(OriginWord not in StopWordSet):
            if(OriginWord in Vocab):
                Vocab[OriginWord]+=1
            else:
                Vocab[OriginWord]=1

100%|██████████| 50000/50000 [00:06<00:00, 7864.43it/s]


In [15]:
for i in tqdm(range(len(tweets_test))):
#     lower case and remove punctuation
    tweet=tokenizer.tokenize(tweets_test[i].lower())
    for word in tweet:
        OriginWord=lemmatizer.lemmatize(word)
        if(OriginWord not in StopWordSet):
            if(OriginWord in Vocab):
                Vocab[OriginWord]+=1
            else:
                Vocab[OriginWord]=1

100%|██████████| 480000/480000 [01:01<00:00, 7784.16it/s]


### Removing the words from the tweet which are in stop words set and have frequency count less than 10

In [19]:
for i in tqdm(range(len(tweets_train))):
    tweet=tokenizer.tokenize(tweets_train[i])
    ModifiedTweet=""
    for word in tweet:
        OriginWord=lemmatizer.lemmatize(word)
        if(OriginWord not in StopWordSet and Vocab[OriginWord]>10):
            ModifiedTweet+=OriginWord+" "
    tweets_train[i]=ModifiedTweet

100%|██████████| 50000/50000 [00:06<00:00, 7162.19it/s]


In [21]:
for i in tqdm(range(len(tweets_test))):
    tweet=tokenizer.tokenize(tweets_test[i])
    ModifiedTweet=""
    for word in tweet:
        OriginWord=lemmatizer.lemmatize(word)
        if(OriginWord not in StopWordSet and Vocab[OriginWord]>10):
            ModifiedTweet+=OriginWord+" "
    tweets_test[i]=ModifiedTweet

100%|██████████| 480000/480000 [01:08<00:00, 6981.30it/s]


In [None]:
ReducedVocab=set()
for i in Vocab:
    if(Vocab[i]>10):
        ReducedVocab.add(i)

In [26]:
vectorizer = CountVectorizer(ngram_range=(1,1),vocabulary=ReducedVocab)
lis=vectorizer.get_feature_names()
ReplaceNameDict={}
for i in range(len(lis)):
    ReplaceNameDict[i]=lis[i]

### From the reduced Vocab construct unigram features for test and train set and saving them

In [27]:
X = vectorizer.fit_transform(tweets_train)
a=X.toarray()
train_frame=pd.DataFrame(data=a)
train_frame.rename(columns=ReplaceNameDict,inplace=True)
train_frame["Label"]=data_train['0'][0:50000]
train_frame.to_csv("train10000.csv",index=False)

In [28]:
X = vectorizer.fit_transform(tweets_test)
a=X.toarray()
test_frame=pd.DataFrame(data=a)
test_frame.rename(columns=ReplaceNameDict,inplace=True)
test_frame["Label"]=data_test['0']
test_frame.to_csv("test50000.csv",index=False)

In [29]:
train_frame.to_csv("train50000_100limit.csv",index=False)
test_frame.to_csv("test_full.csv",index=False)

In [30]:
train_frame["Label"].value_counts()

4    25096
0    24904
Name: Label, dtype: int64

In [31]:
test_frame["Label"].value_counts(0)

4    240796
0    239204
Name: Label, dtype: int64

# Training the Naive Bayes

In [32]:
CountPositive=0
CountNegative=0
for i in train_frame["Label"]:
    if(i==0):
        CountNegative+=1
    else:
        CountPositive+=1
# print(CountPositive)
# print(CountNegative)

In [33]:
ProbabPositive=CountPositive/train_frame["Label"].count()
ProbabNegative=CountNegative/train_frame["Label"].count()
# print(ProbabPositive,ProbabNegative)

In [34]:
PositiveDictionary={}
NegativeDictionary={}
for i in ReducedVocab:
    PositiveDictionary[i]=0
    NegativeDictionary[i]=0
# print(PositiveDictionary,NegativeDictionary)

In [35]:
for i in tqdm(range(len(train_frame))):
    if(train_frame["Label"][i]==0):
        for word in NegativeDictionary:
            NegativeDictionary[word]+=train_frame[word][i]
    else:
        for word in PositiveDictionary:
            PositiveDictionary[word]+=train_frame[word][i]
# print(PositiveDictionary,NegativeDictionary)

100%|██████████| 50000/50000 [41:02<00:00, 20.31it/s]


In [36]:
CountPositiveWords=0
CountNegativeWords=0
for i in PositiveDictionary:
    CountPositiveWords+=PositiveDictionary[i]
for i in NegativeDictionary:
    CountNegativeWords+=NegativeDictionary[i]
# print(CountNegativeWords,CountPositiveWords)

In [37]:
for i in ReducedVocab:
    PositiveDictionary[i]+=1
    PositiveDictionary[i]/=(CountPositiveWords+len(ReducedVocab))
    NegativeDictionary[i]+=1
    NegativeDictionary[i]/=(CountNegativeWords+len(ReducedVocab))

In [40]:
import pickle
file1=open("PositiveDictionary",'wb')
file2=open("NegativeDictionary",'wb')
file3=open("ProbabPositive",'wb')
file4=open("ProbabNegative",'wb')

pickle.dump(PositiveDictionary,file1)
pickle.dump(NegativeDictionary,file2)
pickle.dump(ProbabPositive,file3)
pickle.dump(ProbabNegative,file4)
file1.close()
file2.close()
file3.close()
file4.close()

# Testing the Naive Bayes

In [41]:
ypred=np.zeros(len(test_frame))
for i in tqdm(range(len(test_frame))):
    TempProbabPositive=log(ProbabPositive)
    TempProbabNegative=log(ProbabNegative)
    for j in ReducedVocab:
        if(test_frame[j][i]>0):
            TempProbabNegative+=test_frame[j][i]*log(NegativeDictionary[j])
            TempProbabPositive+=test_frame[j][i]*log(PositiveDictionary[j])
    if(TempProbabNegative>TempProbabPositive):
        ypred[i]=0
    else:
        ypred[i]=4

100%|██████████| 480000/480000 [6:27:57<00:00, 20.62it/s]  


In [42]:
ytemp=ypred
import pickle
file=open("ypred",'wb')
pickle.dump(ypred,file)
file.close()