In [None]:
import pandas as pd
import nltk
import numpy as np
from nltk.corpus import stopwords
import re
from nltk.stem import SnowballStemmer
import matplotlib.pyplot as plt
from wordcloud import WordCloud

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split



In [None]:
# Location of the raw CSV and the column names to attach to it.
# Because `names` is passed explicitly, the first line of the file is read as data.
DATA_PATH = './sentiment140/training.1600000.processed.noemoticon.csv'
COLUMN_NAMES = ['target', 'id', 'date', 'flag', 'user', 'text']

df = pd.read_csv(
    DATA_PATH,
    encoding='latin',
    names=COLUMN_NAMES,
)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Keep only the two columns used downstream, in (text, target) order.
# A plain column selection (rather than an in-place drop of the other columns)
# keeps this cell re-runnable: selecting 'text' and 'target' succeeds no matter
# how many times the cell has already been executed.
# `.copy()` yields an independent frame so that later column assignments on `df`
# do not write into a selection of another frame.
df = df[['text', 'target']].copy()

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Bar chart of how many rows carry each target label.
label_counts = df['target'].value_counts()
label_counts.plot(kind='bar')

In [None]:
# Remap label 4 to 1; every other label value is left as it is.
df['target'] = df['target'].replace(to_replace=4, value=1)

In [None]:
# (rows, columns) of the working frame.
df.shape

In [None]:
# English Snowball stemmer shared by every call to `clean_text`.
stemmer = SnowballStemmer("english")

# Load NLTK's English stopword list. On a fresh environment the corpus may not be
# installed yet, in which case NLTK raises LookupError; fetch it once and retry.
# NOTE: the fallback needs network access.
try:
    _english_stopwords = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    _english_stopwords = stopwords.words('english')

# A set gives constant-time membership tests; the list form forces a linear scan
# for every token of every tweet.
STOPWORDS = set(_english_stopwords)
# Keep "not": negation carries sentiment, so it must survive stopword filtering.
STOPWORDS.discard('not')

# Twitter noise: @mentions, http(s) links and "www."-prefixed links.
# The dot after "www" is escaped so that ordinary words containing "www"
# (e.g. "awwww") are not mistaken for links.
_NOISE_RE = re.compile(r'@\S+|http\S+|www\.\S+')
# Any run of characters that is neither alphanumeric nor whitespace.
_NON_ALNUM_RE = re.compile(r'[^A-Za-z0-9\s]+')


def clean_text(text):
    """Normalise one raw tweet into a space-separated string of stemmed tokens.

    Steps, in order:
      1. Coerce to ``str`` and lowercase.
      2. Replace @mentions and links with a space.
      3. Delete every character that is not a letter, digit or whitespace.
      4. Split on any whitespace (spaces, tabs, newlines), so line breaks
         separate words instead of gluing them together.
      5. Drop stopwords, then stem the remaining tokens.

    A token is dropped when either its surface form or its stem is in
    ``STOPWORDS``. Checking the surface form first matters because many
    stopwords change under stemming (e.g. "very" -> "veri") and would
    otherwise slip through a stem-only check.

    Relies on the module-level ``stemmer`` (object with a ``stem(word)``
    method) and ``STOPWORDS`` (container supporting ``in``).

    Parameters
    ----------
    text : Any
        Raw tweet. Non-string values are converted with ``str()``.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces; empty string if nothing is left.
    """
    text = str(text).lower()
    # Substitute a space (not '') so words on either side of a removed
    # mention/link stay separate tokens.
    text = _NOISE_RE.sub(' ', text)
    text = _NON_ALNUM_RE.sub('', text)

    tokens = []
    # str.split() with no argument collapses whitespace runs and never yields
    # empty tokens, unlike split(' ').
    for word in text.split():
        if word in STOPWORDS:
            continue
        stem = stemmer.stem(word)
        if stem not in STOPWORDS:
            tokens.append(stem)

    return " ".join(tokens)


# Run every tweet through `clean_text` and store the result back in the text column.
cleaned_text = df['text'].apply(clean_text)
df['text'] = cleaned_text

In [None]:
# Preview the frame after `clean_text` has been applied to the text column.
df.head()

In [None]:
 # Word cloud built from the cleaned text stored under index label 0.
 first_tweet = df['text'][0]
 cloud_renderer = WordCloud(
     width=800,
     height=600,
     random_state=42,      # fixed seed for the layout
     max_font_size=100,
 )
 wc = cloud_renderer.generate(first_tweet)

 # Render the cloud as an image with the axes hidden.
 plt.figure(figsize=(15, 10))
 plt.imshow(wc, interpolation='bilinear')
 plt.axis('off')
 plt.show()

In [None]:
# Two-stage pipeline: TF-IDF features over unigrams and bigrams, followed by a
# multinomial naive Bayes classifier with alpha=1.0.
tfidf_step = TfidfVectorizer(ngram_range=(1, 2))
naive_bayes_step = MultinomialNB(alpha=1.0)
model = make_pipeline(tfidf_step, naive_bayes_step)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['target'],
    test_size=0.2,
    random_state=42,
)

# Fit the pipeline on the training portion only.
model.fit(X_train, y_train)

In [None]:
# Predict a label for every tweet in the held-out split.
predicted = model.predict(X_test)

In [None]:
# Accuracy of the predictions against the held-out true labels.
accuracy_score(y_test,predicted)

In [None]:
def predict_sentiment(tweet):
    """Print the predicted sentiment of a single raw tweet.

    The tweet is cleaned with ``clean_text`` and passed to ``model`` as a
    one-element batch. Prints ``'negative'`` when the predicted label equals 0
    and ``'positive'`` for any other label. Returns nothing.
    """
    label = model.predict([clean_text(tweet)])[0]
    print('negative' if label == 0 else 'positive')

In [None]:
# Two hand-written examples to sanity-check the trained model.
tweet1 = ''' Cuban police and protesters battling one another in the streets of Havana. '''
tweet2 = ''' I am feeling lucky today!! '''

# Print the predicted sentiment for each example, in order.
for sample_tweet in (tweet1, tweet2):
    predict_sentiment(sample_tweet)