<a href="https://colab.research.google.com/github/luizinfpp/python-notebooks/blob/sentiment-analysis/sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Libraries

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

#Gathering data

In [2]:
data_neg = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/sentiment/neg_tweets.txt", header=None)
data_pos = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/sentiment/pos_tweets.txt", header=None)

In [3]:
data_neg = data_neg.rename(columns={0: 'tweet'})
data_pos = data_pos.rename(columns={0: 'tweet'})

In [4]:
data_neg["label"] = "neg"
data_pos["label"] = "pos"

In [5]:
data_train = pd.concat([data_neg, data_pos])
data_train = data_train.reset_index(drop=True)

#Modelling

In [None]:
vectorizer = CountVectorizer(analyzer = 'word', lowercase = False, stop_words='english')
features = vectorizer.fit_transform(data_train['tweet'].values)
features_nd = features.toarray() 

##Train

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test  = train_test_split(
        features_nd, 
        data_train['label'].values,
        train_size=0.8, 
        random_state=1234)

##Model

In [None]:
from sklearn.linear_model import LogisticRegression
log_model = LogisticRegression()
log_model = log_model.fit(X=X_train, y=y_train)
y_pred = log_model.predict(X_test)

##Accuracy

In [None]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.78


#With NLTK

In [6]:
import nltk

In [59]:
def tokenize(sent):
    return({word: True for word in sent})

In [27]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [36]:
data_all_nltk = []

for index, row in data_train.iterrows():
    data_all_nltk.append([nltk.word_tokenize(row['tweet']),row['label']])

print(data_all_nltk)

[[['@', 'iggigg', 'too', 'busy', 'to', 'see', 'me', 'in', 'London', 'this', 'evening', '.', 'What', 'is', 'a', 'boy', 'to', 'do', '?'], 'neg'], [['cavs', 'lost', ',', 'and', 'I', 'got', 'this', 'sinking', 'feeling', 'we', 'are', 'going', 'to', 'lose', 'Lebron', 'in', '2010', 'also', '...', 'why', 'must', 'my', 'home', 'city', 'SUCK', '?', 'Ah', 'well', ',', 'LETS', 'GO', 'BROWNS', '!'], 'neg'], [['the', 'closest', 'BGT', 'tour', 'is', 'Cardiff', 'or', 'London', 'dam', 'it', 'why', 'doesnt', 'anybody', 'other', 'than', 'the', 'Chuckle', 'Brothers', 'tour', 'the', 'Westcountry', '?'], 'neg'], [['Why', 'do', 'other', 'pet', 'care', 'people', 'try', 'to', 'run', 'others', 'out', 'of', 'business', '?', 'Or', 'send', 'suspicious', 'e-mails', 'fishing', 'for', 'info', '?'], 'neg'], [['-gasps-', 'dananananaykroyd', 'touring', 'aussie', '....', 'GRRR', 'WHY', 'ALWAYS', 'SYDNEY', 'AND', 'MELBZ', 'FFS'], 'neg'], [['....', 'If', 'i', 'am', 'going', 'warsal', 'that', 'means', 'no', 'church', 'in', 

In [None]:
from nltk.tag import pos_tag

print(pos_tag(data_all_nltk[0][0]))

In [33]:
from nltk.stem.wordnet import WordNetLemmatizer

def lemmatize_sentence(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = []
    for word, tag in pos_tag(tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
    return lemmatized_sentence

print(lemmatize_sentence(data_all_nltk[0][0]))

['@', 'iggigg', 'too', 'busy', 'to', 'see', 'me', 'in', 'London', 'this', 'evening', '.', 'What', 'be', 'a', 'boy', 'to', 'do', '?']


In [34]:
import re, string

def remove_noise(tweet_tokens, stop_words = ()):

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub("(@[A-Za-z0-9_]+)","", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [35]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

print(remove_noise(data_all_nltk[0][0], stop_words))

['iggigg', 'busy', 'see', 'london', 'evening', 'boy']


In [37]:
cleaned_tokens_list = []

for element in data_all_nltk:
    cleaned_tokens_list.append(remove_noise(element[0], stop_words))

In [38]:
def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token

all_words = get_all_words(cleaned_tokens_list)

In [39]:
from nltk import FreqDist

freq_dist_pos = FreqDist(all_words)
print(freq_dist_pos.most_common(10))

[('...', 278), ("n't", 239), ('get', 229), ('go', 223), ("'s", 212), ("'m", 159), ('..', 136), ('day', 110), ('good', 105), ('work', 100)]


In [47]:
cleaned_data_nltk = []

for element in data_all_nltk:
    cleaned_data_nltk.append([remove_noise(element[0], stop_words), element[1]])

In [60]:
data_nltk_for_model = []

for element in cleaned_data_nltk:
    data_nltk_for_model.append([tokenize(element[0]),element[1]])

In [61]:
import random

random.shuffle(data_nltk_for_model)

In [62]:
train_nltk = data_nltk_for_model[:int((.8)*len(data_nltk_for_model))]
test_nltk = data_nltk_for_model[int((.8)*len(data_nltk_for_model)):]

In [63]:
print(train_nltk)

[[{'uhmmm': True, 'squarespace': True, 'need': True, 'iphone': True, 'please': True}, 'neg'], [{'wonder': True, 'jon': True, 'think': True, 'see': True, "'s": True, 'tweet': True, 'picture': True, 'jordan': True, 'bust': True, 'nut': True, 'laughing': True, 'us': True, 'oh': True, 'yeah': True, '..': True, '...': True}, 'pos'], [{'keeping': True, 'finger': True, 'cross': True, 'buddy': True, 'feel': True, 'well': True}, 'neg'], [{'kvay2k': True, 'hope': True, 'wonderful': True, 'night': True, 'sent': True, 'message': True, 'youtube': True}, 'pos'], [{'kiki_huggles': True, 'wana': True, 'go': True, 'back': True, 'london': True, 'tooooooooo': True}, 'neg'], [{'starting': True, '2nd': True, 'shift': True, 'im': True, 'go': True, 'miss': True, 'like': True, 'hour': True, 'lakers': True, 'game': True}, 'neg'], [{'wow': True, 'bad': True, 'headache': True, 'month': True, 'someone': True, 'make': True, 'stop': True}, 'neg'], [{'watch': True, 'catherine': True, 'tate': True, 'video': True, 'cl

In [64]:
from nltk.classify import NaiveBayesClassifier

classifier = NaiveBayesClassifier.train(train_nltk)

In [65]:
classifier.show_most_informative_features()

Most Informative Features
                 awesome = True              pos : neg    =     27.2 : 1.0
                headache = True              neg : pos    =     21.9 : 1.0
                 amazing = True              pos : neg    =     12.5 : 1.0
                   thank = True              pos : neg    =     12.5 : 1.0
                    nice = True              pos : neg    =     11.5 : 1.0
                   amaze = True              pos : neg    =     11.0 : 1.0
               beautiful = True              pos : neg    =     11.0 : 1.0
                    love = True              pos : neg    =     10.8 : 1.0
                    lose = True              neg : pos    =     10.7 : 1.0
                   great = True              pos : neg    =     10.6 : 1.0


In [66]:
print(classifier.classify(tokenize('Nice day')))

neg


In [67]:
custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."

custom_tokens = remove_noise(nltk.word_tokenize(custom_tweet))

print(classifier.classify(dict([token, True] for token in custom_tokens)))

neg


In [68]:
from nltk import classify

print(classify.accuracy(classifier, test_nltk))

0.815


#Plotting

In [None]:
import matplotlib.pyplot as plt

plt.scatter()
plt.show()