In [1]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.tag import pos_tag
import re, string

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('twitter_samples')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
! pip install -q kaggle

In [3]:
from google.colab import files

files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"hawkeye619","key":"c29e45f048307718e33b1011c2b04cf5"}'}

In [4]:
! mkdir ~/.kaggle

! cp kaggle.json ~/.kaggle/

! chmod 600 ~/.kaggle/kaggle.json

In [5]:
! kaggle datasets download -d kazanova/sentiment140

Downloading sentiment140.zip to /content
 98% 79.0M/80.9M [00:02<00:00, 38.6MB/s]
100% 80.9M/80.9M [00:02<00:00, 36.5MB/s]


In [6]:
from zipfile import ZipFile
file_name = "sentiment140.zip"

with ZipFile(file_name,'r') as zip:
  zip.extractall()
  print('Done')

Done


In [8]:
import pandas as pd
df = pd.read_csv('./training.1600000.processed.noemoticon.csv', encoding='latin', header=None, 
                 names=["polarity", "id", "date", "flag", "user", "tweet"])
df = df.sample(n=14000)
df.head()
tweet_dataset = df.drop(["id","flag","date","user"], axis = 1)
tweet_dataset.head()

Unnamed: 0,polarity,tweet
758589,0,@ResourcefulMom How did your new family membe...
1042315,4,@dragonzord12 it should be available on the we...
1497590,4,chillin with adam just like the good ole days
1455349,4,875 goodnight! 1 more tetris game then grey's...
1364754,4,@Jayde_Nicole MEEE??? I'm Canadian TOO! New B...


In [9]:
def preprocess_tweet_text(tweet):
  
  #convert all text lowercase
  tweet = tweet.lower()

  #remove any urls
  tweet = re.sub(r"http\S+", "", tweet, flags=re.MULTILINE)

  #remove punctuations
  tweet = tweet.translate(str.maketrans("","",string.punctuation))

  #remove user @ references and '#' from tweet
  tweet = re.sub(r'\@\w+|\#',"",tweet)

  #remove stopwords
  tweet_tokens = word_tokenize(tweet)
  filtered_words = [word for word in tweet_tokens if word not in stop_words]

  #stemming
  ps = PorterStemmer()
  stemmed_words = [ps.stem(w) for w in filtered_words]

  #lemmatizing
  lemmatizer = WordNetLemmatizer()
  lemma_words = [lemmatizer.lemmatize(w,pos = 'a') for w in stemmed_words]

  return " ".join(lemma_words)

preprocess_tweet_text("http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D")  

'awww that bummer shoulda got david carr third day'

In [10]:
tweets_list = tweet_dataset['tweet'].tolist() 
polarity_list = tweet_dataset['polarity'].tolist()
positive_tweets = []
negative_tweets = []
for x in range(len(tweets_list)):
  if polarity_list[x]==0:
     negative_tweets.append(word_tokenize(preprocess_tweet_text(tweets_list[x])))
  else: positive_tweets.append(word_tokenize(preprocess_tweet_text(tweets_list[x])))   
print(positive_tweets[0])  

['dragonzord12', 'avail', 'websit', 'thru', 'amazon', 'novemb', 'thank']


In [11]:
#Preparing Data for the Model
def get_tweets_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)                     #https://www.nltk.org/_modules/nltk/classify/naivebayes.html

positive_tokens_for_model = get_tweets_for_model(positive_tweets)
negative_tokens_for_model = get_tweets_for_model(negative_tweets)

In [12]:
#Splitting the dataset for Testing and Training
import random

positive_dataset = [(tweet_dict, "Positive")
                     for tweet_dict in positive_tokens_for_model]

negative_dataset = [(tweet_dict, "Negative")
                     for tweet_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset

random.shuffle(dataset)

train_data = dataset[:7000]
test_data = dataset[7000:]

In [13]:
#Building and Testing the Model
from nltk import classify
from nltk import NaiveBayesClassifier
classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

print(classifier.show_most_informative_features(10))

Accuracy is: 0.7192857142857143
Most Informative Features
                    suck = True           Negati : Positi =     18.0 : 1.0
                    hurt = True           Negati : Positi =     15.0 : 1.0
                     sad = True           Negati : Positi =     13.4 : 1.0
                    fell = True           Negati : Positi =     10.8 : 1.0
                 perfect = True           Positi : Negati =     10.6 : 1.0
                     die = True           Negati : Positi =     10.5 : 1.0
                    sick = True           Negati : Positi =     10.1 : 1.0
                 special = True           Positi : Negati =      9.9 : 1.0
                    fail = True           Negati : Positi =      9.6 : 1.0
                     idk = True           Negati : Positi =      9.5 : 1.0
None


In [14]:
# Testing

custom_tweet = "I welcomed them with a sad heart"
custom_tweet = preprocess_tweet_text(custom_tweet)
custom_tokens = word_tokenize(custom_tweet)

print(classifier.classify(dict([token, True] for token in custom_tokens)))

Negative
