In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
TRAINING_DATA_FILE = "./dataSet/training/training.1600000.processed.noemoticon.csv"
CLEANED_TRAINING_DATA_FILE = "./dataSet/training/cleaned.training.temp.csv"
TOKEN_FREQUENCY_FILE = "./dataSet/training/token.frequency.csv"

In [1]:
# Training (http://help.sentiment140.com/for-students/)

cols = ['sentiment','id','date','query_string','user','text']
df = pd.read_csv(TRAINING_DATA_FILE,header=None, names=cols, encoding='latin-1')
df.head()
# df.sentiment.value_counts()

Unnamed: 0,sentiment,id,date,query_string,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [2]:
df.drop(['id','date','query_string','user'],axis=1,inplace=True)

# Data Preparation
df['pre_clean_len'] = [len(t) for t in df.text]

In [3]:
# Data Dictionary — first draft
from pprint import pprint
data_dict = {
    'sentiment':{
        'type':df.sentiment.dtype,
        'description':'sentiment class - 0:negative, 1:positive'
    },
    'text':{
        'type':df.text.dtype,
        'description':'tweet text'
    },
    'pre_clean_len':{
        'type':df.pre_clean_len.dtype,
        'description':'Length of the tweet before cleaning'
    },
    'dataset_shape':df.shape
}
data_dict

{'sentiment': {'type': dtype('int64'),
  'description': 'sentiment class - 0:negative, 1:positive'},
 'text': {'type': dtype('O'), 'description': 'tweet text'},
 'pre_clean_len': {'type': dtype('int64'),
  'description': 'Length of the tweet before cleaning'},
 'dataset_shape': (1600000, 3)}

In [4]:
# from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer
import re
tok = WordPunctTokenizer()
pat1 = r'@[A-Za-z0-9]+'
pat2 = r'https?://[A-Za-z0-9./]+'
combined_pat = r'|'.join((pat1, pat2))
def tweet_cleaner(text):
#     soup = BeautifulSoup(text, 'lxml')
#     souped = soup.get_text()
    stripped = re.sub(combined_pat, '', text)
    try:
        clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        clean = stripped
    letters_only = re.sub("[^a-zA-Z]", " ", clean)
    lower_case = letters_only.lower()
    # During the letters_only process two lines above, it has created unnecessay white spaces,
    # I will tokenize and join together to remove unneccessary white spaces
    words = tok.tokenize(lower_case)
    return (" ".join(words)).strip()

testing = df.text[:100]
test_result = []
for t in testing:
    test_result.append(tweet_cleaner(t))
test_result

['awww that s a bummer you shoulda got david carr of third day to do it d',
 'is upset that he can t update his facebook by texting it and might cry as a result school today also blah',
 'i dived many times for the ball managed to save the rest go out of bounds',
 'my whole body feels itchy and like its on fire',
 'no it s not behaving at all i m mad why am i here because i can t see you all over there',
 'not the whole crew',
 'need a hug',
 'hey long time no see yes rains a bit only a bit lol i m fine thanks how s you',
 'k nope they didn t have it',
 'que me muera',
 'spring break in plain city it s snowing',
 'i just re pierced my ears',
 'i couldn t bear to watch it and i thought the ua loss was embarrassing',
 'it it counts idk why i did either you never talk to me anymore',
 'i would ve been the first but i didn t have a gun not really though zac snyder s just a doucheclown',
 'i wish i got to watch it with you i miss you and how was the premiere',
 'hollis death scene will hurt

In [5]:
nums = [0,400000,800000,1200000,1600000]
print("Cleaning and parsing the tweets...\n")
clean_tweet_texts = []
for i in range(len(nums) - 1):
    for j in range(nums[i], nums[i + 1]):
        if((j + 1) % 20000 == 0):
            print("Tweets %d of %d has been processed" % (j + 1, nums[-1]))                                                                  
        clean_tweet_texts.append(tweet_cleaner(df['text'][j]))

Cleaning and parsing the tweets...

Tweets 20000 of 1600000 has been processed
Tweets 40000 of 1600000 has been processed
Tweets 60000 of 1600000 has been processed
Tweets 80000 of 1600000 has been processed
Tweets 100000 of 1600000 has been processed
Tweets 120000 of 1600000 has been processed
Tweets 140000 of 1600000 has been processed
Tweets 160000 of 1600000 has been processed
Tweets 180000 of 1600000 has been processed
Tweets 200000 of 1600000 has been processed
Tweets 220000 of 1600000 has been processed
Tweets 240000 of 1600000 has been processed
Tweets 260000 of 1600000 has been processed
Tweets 280000 of 1600000 has been processed
Tweets 300000 of 1600000 has been processed
Tweets 320000 of 1600000 has been processed
Tweets 340000 of 1600000 has been processed
Tweets 360000 of 1600000 has been processed
Tweets 380000 of 1600000 has been processed
Tweets 400000 of 1600000 has been processed
Tweets 420000 of 1600000 has been processed
Tweets 440000 of 1600000 has been processed


In [6]:
clean_df = pd.DataFrame(clean_tweet_texts,columns=['text'])
clean_df['target'] = df.sentiment
clean_df.head()

Unnamed: 0,text,target
0,awww that s a bummer you shoulda got david car...,0
1,is upset that he can t update his facebook by ...,0
2,i dived many times for the ball managed to sav...,0
3,my whole body feels itchy and like its on fire,0
4,no it s not behaving at all i m mad why am i h...,0


In [7]:
clean_df.to_csv('clean_tweet.csv',encoding='utf-8')
csv = 'clean_tweet.csv'
my_df = pd.read_csv(csv,index_col=0)
my_df.head()

  mask |= (ar1 == a)


Unnamed: 0,text,target
0,awww that s a bummer you shoulda got david car...,0
1,is upset that he can t update his facebook by ...,0
2,i dived many times for the ball managed to sav...,0
3,my whole body feels itchy and like its on fire,0
4,no it s not behaving at all i m mad why am i h...,0


In [8]:
my_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1600000 entries, 0 to 1599999
Data columns (total 2 columns):
text      1596804 non-null object
target    1600000 non-null int64
dtypes: int64(1), object(1)
memory usage: 36.6+ MB


In [9]:
my_df[my_df.isnull().any(axis=1)].head()

Unnamed: 0,text,target
208,,0
249,,0
398,,0
430,,0
1011,,0


In [10]:
# Drop data with empty text
my_df.dropna(inplace=True)
my_df.reset_index(drop=True,inplace=True)
my_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1596804 entries, 0 to 1596803
Data columns (total 2 columns):
text      1596804 non-null object
target    1596804 non-null int64
dtypes: int64(1), object(1)
memory usage: 24.4+ MB


In [11]:
my_df.to_csv(CLEANED_TRAINING_DATA_FILE,encoding='utf-8')

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cvec = CountVectorizer()
cvec.fit(my_df.text)

In [None]:
len(cvec.get_feature_names())

In [None]:
neg_doc_matrix = cvec.transform(my_df[my_df.target == 0].text)
pos_doc_matrix = cvec.transform(my_df[my_df.target == 1].text)
neg_tf = np.sum(neg_doc_matrix,axis=0)
pos_tf = np.sum(pos_doc_matrix,axis=0)
neg = np.squeeze(np.asarray(neg_tf))
pos = np.squeeze(np.asarray(pos_tf))
term_freq_df = pd.DataFrame([neg,pos],columns=cvec.get_feature_names()).transpose()

In [None]:
document_matrix = cvec.transform(my_df.text)

In [None]:
my_df[my_df.target == 0].tail()

In [None]:
%%time
neg_batches = np.linspace(0,798179,100).astype(int)
i=0
neg_tf = []
while i < len(neg_batches)-1:
    batch_result = np.sum(document_matrix[neg_batches[i]:neg_batches[i+1]].toarray(),axis=0)
    neg_tf.append(batch_result)
    if (i % 10 == 0) | (i == len(neg_batches)-2):
        print neg_batches[i+1],"entries' term freuquency calculated"
    i += 1

In [None]:
my_df.tail()

In [None]:
%%time
pos_batches = np.linspace(798179,1596019,100).astype(int)
i=0
pos_tf = []
while i < len(pos_batches)-1:
    batch_result = np.sum(document_matrix[pos_batches[i]:pos_batches[i+1]].toarray(),axis=0)
    pos_tf.append(batch_result)
    if (i % 10 == 0) | (i == len(pos_batches)-2):
        print pos_batches[i+1],"entries' term freuquency calculated"
    i += 1

In [None]:

neg = np.sum(neg_tf,axis=0)
pos = np.sum(pos_tf,axis=0)
term_freq_df = pd.DataFrame([neg,pos],columns=cvec.get_feature_names()).transpose()
term_freq_df.head()

In [None]:

term_freq_df.columns = ['negative', 'positive']
term_freq_df['total'] = term_freq_df['negative'] + term_freq_df['positive']
term_freq_df.sort_values(by='total', ascending=False).iloc[:10]

In [None]:
len(term_freq_df)

In [None]:
term_freq_df.to_csv('term_freq_df.csv',encoding='utf-8')