
<div class="alert alert-block alert-info">
<b>Project:</b> TWITTER SENTIMENT ANALYSIS - MODEL FOR PREDICTING POLARITY
</div>



In [1]:
#importing data into dataframe
import pandas as pd
# Location of the raw tweet CSV and the column labels assigned to it
# (the labels are supplied explicitly through `names=`).
CSV_PATH = r"C:\Users\USER\Downloads\twitter_new.csv"
COLUMN_NAMES = ['target', 'id', 'date', 'flag', 'user', 'text']

data = pd.read_csv(CSV_PATH, encoding='latin-1', names=COLUMN_NAMES)

# `.size` is the total number of cells (rows x columns), not the row count.
data.size

9600000

In [None]:
# For training purposes we use only a small portion of the data: 10,000 samples (5,000 with target 4 + 5,000 with target 0).
from random import sample
# Split the tweets by label: rows with target == 4 go to `pos`,
# rows with target == 0 go to `neg`.
pos = data[data['target'] == 4]
neg = data[data['target'] == 0]

# Draw 5000 rows from each class. A fixed random_state makes the sample
# reproducible across kernel restarts (same seed value as the train/test split).
samp_pos = pos.sample(5000, random_state=0)
samp_neg = neg.sample(5000, random_state=0)

# Stack both samples into one frame and recode label 4 as 1 so that the
# target column holds 0/1.
samp = pd.concat([samp_pos, samp_neg], axis=0)
samp['target'] = samp['target'].replace([4], 1)

In [None]:
# Now we will use this samp dataset to test our code for text preprocessing.
#text pre-processing steps
#1. Converting to lower case
def lower(tweet):
    """Return a lower-cased copy of `tweet`.

    Args:
        tweet (str): Text to normalise.

    Returns:
        str: `tweet` with all cased characters converted to lower case.
    """
    lowered = tweet.lower()
    return lowered

#2. Remove numbers using regex.
import re
def rm_digits(tweet):
    """Return `tweet` with every run of digits removed.

    Args:
        tweet (str): Text to clean.

    Returns:
        str: `tweet` without digit characters; all other characters
        (including the whitespace around removed numbers) are kept.
    """
    # The digit class needs a backslash. The earlier pattern used a forward
    # slash, which matched a literal "/" followed by the letter "d" and
    # therefore left numbers in the text untouched.
    return re.sub(r'\d+', '', tweet)
#3. Remove punctuations
import string
def rm_punct(tweet):
    """Return `tweet` with all characters of `string.punctuation` removed.

    Args:
        tweet (str): Text to clean.

    Returns:
        str: `tweet` without any punctuation character.
    """
    # A translation table that maps every punctuation character to "deleted"
    # strips them all in one pass over the text.
    drop_table = str.maketrans('', '', string.punctuation)
    return tweet.translate(drop_table)

#4. Remove whitespaces
def rm_whitespaces(tweet):
    """Collapse whitespace in `tweet`.

    Args:
        tweet (str): Text to clean.

    Returns:
        str: The whitespace-separated pieces of `tweet` joined by single
        spaces, with no leading or trailing whitespace.
    """
    # split() without arguments splits on any run of whitespace and discards
    # empty pieces at both ends.
    pieces = tweet.split()
    return " ".join(pieces)

#5. Remove stopwords
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# English stop-word set, built on first use and then reused by rm_stpwrds.
_ENGLISH_STOP_WORDS = None


def rm_stpwrds(tweet, stop_words=None):
    """Tokenize `tweet` and drop the tokens that are stop words.

    Args:
        tweet (str): Text to tokenize with `word_tokenize`.
        stop_words (collection of str, optional): Tokens to discard. When
            omitted, the NLTK English stop-word list is used.

    Returns:
        list[str]: Tokens of `tweet`, in order, that are not in `stop_words`.
    """
    global _ENGLISH_STOP_WORDS
    if stop_words is None:
        # Build the default set only once: this function is applied to every
        # tweet, and rebuilding the set per call repeats the same work.
        if _ENGLISH_STOP_WORDS is None:
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOP_WORDS
    return [token for token in word_tokenize(tweet) if token not in stop_words]

#6. Stemming - Converting the word tokens to stem/root word(Grouping of different inflected forms of same words)
from nltk.stem.porter import PorterStemmer
def stemr(tweet):
    """Stem every token of `tweet` with a `PorterStemmer`.

    Args:
        tweet (iterable of str): Word tokens.

    Returns:
        list[str]: The stem of each token, in the original order.
    """
    stem = PorterStemmer().stem
    return [stem(token) for token in tweet]
    
#7. Lemmatization - to get valid words from the word tokens
from nltk.stem import WordNetLemmatizer
def lem(tweet):
    """Lemmatize every token of `tweet` with a `WordNetLemmatizer`.

    Args:
        tweet (iterable of str): Word tokens.

    Returns:
        list[str]: The lemma of each token, in the original order. Each
        token is lemmatized with `pos='v'`.
    """
    wordnet = WordNetLemmatizer()
    lemmas = []
    for token in tweet:
        lemmas.append(wordnet.lemmatize(token, pos='v'))
    return lemmas
    
def join_tokens(tweet):
    """Join word tokens back into one space-separated string.

    Args:
        tweet (iterable of str): Word tokens.

    Returns:
        str: The tokens joined with a single space between them.
    """
    separator = ' '
    return separator.join(tweet)



# Cleaning steps, applied to samp['text'] one after another in this order.
# The first four work on strings; rm_stpwrds turns each tweet into a token
# list, stemr and lem transform those tokens, and join_tokens converts the
# token list back to text.
text_pipeline = (
    lower,            # lower case
    rm_digits,        # digit removal
    rm_punct,         # punctuation removal
    rm_whitespaces,   # whitespace removal
    rm_stpwrds,       # removing stopwords
    stemr,            # stemming
    lem,              # lemmatizing
    join_tokens,      # token list back to text using join
)

for step in text_pipeline:
    samp['text'] = samp['text'].apply(step)


In [None]:


# extract the features using count vectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
# Split the cleaned text first, so that the vocabulary is learned from the
# training tweets only. Fitting the vectorizer on all rows before splitting
# would let words from the test tweets leak into the feature space.
text_train, text_test, y_train, y_test = train_test_split(
    samp['text'],
    samp["target"],
    test_size=0.2,
    random_state=0)

# Learn the vocabulary on the training text, then encode both parts with it.
vectorizer = CountVectorizer(dtype='uint8')
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Count matrix of the whole sample, encoded with the train-only vocabulary
# (kept so that code referring to `df_countvectorizer` still works).
df_countvectorizer = vectorizer.transform(samp['text'])
df_countvectorizer.shape




In [None]:
# Number of features: the size of the vocabulary held by the fitted
# `vectorizer` (one entry per distinct token it learned).
num_features = len(vectorizer.vocabulary_)
print(num_features)

In [None]:
# Multinomial Naive Bayes classifier
# Logistic Regression classifier
# Support Vector classifier (linear kernel)

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm

# Train the three models on the same training split. fit() returns the fitted
# estimator itself, so construction and training can be chained.
Log_classifier = LogisticRegression(max_iter=1500).fit(X_train, y_train)

MNB_classifier = MultinomialNB().fit(X_train, y_train)

# Kept as two statements so the fitted SVC stays the cell's final expression.
svc = svm.SVC(kernel='linear')
svc.fit(X_train, y_train)





In [None]:
# predicting the labels 
from sklearn import metrics

# Fitted classifiers to evaluate; each one is scored on the same test split.
model_list = [Log_classifier, MNB_classifier, svc]

for classifier in model_list:
    # Predict labels for the test features and compare them with y_test.
    y_pred = classifier.predict(X_test)
    acc = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)

    print(f"Performance of {classifier}")
    print(f"Accuracy = {acc}")
    print(f"Precision = {precision}")
    print(f"Recall = {recall}")

    # Visual separator between the reports of consecutive models.
    print("--" * 55)
    


In [None]:
# model accuracy
#print(classification_report(y_test, y_pred))