In [1]:
import re
import string

import nltk
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Read the training and test CSV files from the working directory.
train = pd.read_csv('train_2kmZucJ.csv')
test = pd.read_csv('test_oJQbWVk.csv')

In [3]:
# Display the first five rows of the training frame as a sanity check.
train.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [4]:
# Total number of missing cells across all columns, for train and test respectively.
train.isnull().sum().sum(),test.isnull().sum().sum()

(0, 0)

In [5]:
# NOTE: DataFrame.size is rows x columns (total cell count), not the row count.
# NOTE(review): `.shape` may have been intended here -- confirm.
train.size ,test.size

(23760, 3906)

In [6]:
# Model input is the raw tweet text; the target is the label column.
X, y = train['tweet'], train['label']

In [7]:
# Hold out 33% of the labelled tweets for evaluation; the seed is fixed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Pre-processing: clean, tokenize and lemmatize tweets

In [8]:
# Compiled once at definition time instead of being looked up on every call.
_URL_PATTERN = re.compile(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''')

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily filled cache for the NLTK resources, so that defining this cell does
# not touch the corpora and repeated calls do not reload them.
_NLP_RESOURCES = {}


def _nlp_resources():
    """Return ``(stop_words, lemmatizer)``, building both on first use only."""
    if not _NLP_RESOURCES:
        _NLP_RESOURCES['stop_words'] = frozenset(
            nltk.corpus.stopwords.words('english')
        )
        _NLP_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _NLP_RESOURCES['stop_words'], _NLP_RESOURCES['lemmatizer']


def pre_processing(text):
    """Turn a raw tweet into a list of cleaned, lemmatized tokens.

    Steps, in order: replace URLs with a space, lower-case, delete ASCII
    punctuation, collapse whitespace, tokenize, drop English stop words and
    lemmatize each remaining token as a verb. No stemming is applied.

    Parameters
    ----------
    text : str
        The raw tweet.

    Returns
    -------
    list of str
        The processed tokens, in their original order.
    """
    without_urls = _URL_PATTERN.sub(" ", text)
    lowered = without_urls.lower()

    # Remove punctuation
    no_punctuation = lowered.translate(_PUNCTUATION_TABLE)

    # Collapse runs of whitespace and trim the ends
    normalized = " ".join(no_punctuation.split())

    # Tokenize, drop default stop words, then lemmatize
    stop_words, lemmatizer = _nlp_resources()
    kept_tokens = [
        token for token in word_tokenize(normalized) if token not in stop_words
    ]
    return [lemmatizer.lemmatize(token, pos='v') for token in kept_tokens]

In [9]:
def count_tweets(result, tweets, ys):
    """Accumulate (token, label) occurrence counts into ``result``.

    Each tweet is passed through ``pre_processing`` and every resulting token
    is counted under the label paired with that tweet.

    Parameters
    ----------
    result : dict
        Maps ``(token, label)`` to a count; updated in place.
    tweets : iterable of str
        Raw tweets.
    ys : iterable
        Labels, paired positionally with ``tweets``.

    Returns
    -------
    dict
        The same ``result`` object, after updating.
    """
    for label, tweet in zip(ys, tweets):
        for token in pre_processing(tweet):
            key = (token, label)
            # Missing keys start from zero, so new and existing pairs share one path.
            result[key] = result.get(key, 0) + 1
    return result

In [10]:
# Build the (token, label) frequency table from the training split only.
freqs = count_tweets(result={}, tweets=X_train, ys=y_train)

In [11]:
def lookup(freqs, word, label):
    """Return how often ``(word, label)`` was counted, or 0 if it never was.

    Parameters
    ----------
    freqs : dict
        Maps ``(word, label)`` to a count.
    word : hashable
        The token to look up.
    label : hashable
        The class label to look up.
    """
    return freqs.get((word, label), 0)

In [18]:
def train_naive_bayes(freqs, train_x, train_y):
    """Fit a binary Naive Bayes model from (word, label) counts.

    Parameters
    ----------
    freqs : dict
        Maps ``(word, label)`` to an occurrence count. Pairs whose label is
        greater than 0 contribute to the positive class, all others to the
        negative class.
    train_x : object
        Not read by this function; kept so existing callers keep working.
    train_y : iterable of numbers
        One label per training document; values > 0 are positive, values
        <= 0 are negative.

    Returns
    -------
    tuple
        ``(logprior, loglikelihood)``: ``logprior`` is
        ``log(#positive docs) - log(#negative docs)``, and ``loglikelihood``
        maps each vocabulary word to the log of the ratio between its
        add-one-smoothed positive and negative probabilities.
    """
    vocab = {pair[0] for pair in freqs}
    V = len(vocab)

    # Total word occurrences per class.
    N_pos = N_neg = 0
    for pair, count in freqs.items():
        if pair[1] > 0:
            N_pos += count
        else:
            N_neg += count

    # Document counts per class, gathered in a single pass over the labels.
    D_pos = D_neg = 0
    for label in train_y:
        if label > 0:
            D_pos += 1
        elif label <= 0:
            D_neg += 1
    logprior = np.log(D_pos) - np.log(D_neg)

    loglikelihood = {}
    for word in vocab:
        # Labels 1 and 0 are the keys used for the positive / negative class.
        freq_pos = lookup(freqs, word, 1)
        freq_neg = lookup(freqs, word, 0)

        # Add-one smoothing keeps unseen (word, class) pairs away from zero.
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)

        loglikelihood[word] = np.log(p_w_pos / p_w_neg)
    return logprior, loglikelihood

In [19]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    """Score a tweet as the log-prior plus the log-likelihood of its known tokens.

    Parameters
    ----------
    tweet : str
        Raw tweet; it is run through ``pre_processing`` first.
    logprior : number
        The class log-prior.
    loglikelihood : dict
        Maps a token to its log-likelihood ratio. Tokens missing from this
        mapping are skipped.

    Returns
    -------
    number
        The accumulated score.
    """
    score = 0
    score += logprior
    known_tokens = (
        token for token in pre_processing(tweet) if token in loglikelihood
    )
    for token in known_tokens:
        score += loglikelihood[token]
    return score

In [20]:
# Fit the Naive Bayes parameters on the training split.
logprior, loglikelihood = train_naive_bayes(
    freqs,
    train_x=X_train,
    train_y=y_train,
)

In [22]:
# Predict a label for every held-out tweet: 1 when the Naive Bayes score is
# positive, 0 otherwise. One pass over X_test is enough; wrapping this in a
# second loop over X_test would only repeat the same predictions.
y_hats = [
    1 if naive_bayes_predict(tweet, logprior, loglikelihood) > 0 else 0
    for tweet in X_test
]

In [35]:
# Accuracy of the Naive Bayes predictions on the held-out split.
# (accuracy_score is imported in the notebook's top import cell.)
print(accuracy_score(y_test, y_hats))

0.8978576893649579


In [28]:
# Predict a label for every tweet in the test file: 1 when the Naive Bayes
# score is positive, 0 otherwise. One pass over the column is enough.
y_ha = [
    1 if naive_bayes_predict(tweet, logprior, loglikelihood) > 0 else 0
    for tweet in test['tweet']
]

In [38]:
# Submission file for the Naive Bayes predictions: each test id next to its
# predicted label, written without the index column.
pred_sub = pd.DataFrame(y_ha, columns=['label'])
# pd.concat along axis=1 already yields a DataFrame, so no extra wrapping is needed.
final_data = pd.concat([test['id'], pred_sub], axis=1)
final_data.to_csv(r'submission.csv', index=False)