In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pickle
import numpy as np
import pandas as pd

lemmatizer = WordNetLemmatizer()

'''
polarity 0 = negative. 2 = neutral. 4 = positive.
id
date
query
user
tweet
'''

'\npolarity 0 = negative. 2 = neutral. 4 = positive.\nid\ndate\nquery\nuser\ntweet\n'

In [2]:
def init_process(fin,fout): # output lots of "[0,1]:::tweet content", unless the tweet is neutral
    outfile = open(fout,'a') # append to end of fout file
    with open(fin, buffering=200000, encoding='latin-1') as f: # 200KB buffer
        try:
            for line in f:
                line = line.replace('"','')
                initial_polarity = line.split(',')[0]
                if initial_polarity == '0':
                    initial_polarity = [1,0]
                elif initial_polarity == '4': 
                    initial_polarity = [0,1]
                # the code doesn't deal with neutral (2) tweets?

                tweet = line.split(',')[-1]
                outline = str(initial_polarity)+':::'+tweet
                outfile.write(outline)
        except Exception as e:
            print(str(e))
    outfile.close()

In [3]:
init_process('./stanford_sentiment_tweets/training.1600000.processed.noemoticon.csv','train_set.csv')
init_process('./stanford_sentiment_tweets/testdata.manual.2009.06.14.csv','test_set.csv')

#### test_set contains more negative sentiment data than positive, roughly 3:1

In [4]:
def create_lexicon(fin): # lexicon will look like this: ['words', 'order', 'lexicon', 'in', 'random']
    lexicon = []
    with open(fin, 'r', buffering=100000, encoding='latin-1') as f:
        try:
            counter = 1
            for line in f: # sample 1 line for every 2500 lines, maybe using linecache module would be better?
                counter += 1
                if (counter%2500) is 0:
                    content = ''
                    tweet = line.split(':::')[1]
                    content += ' '+tweet
                    words = word_tokenize(content) # ['this','is','a','tweet']
                    words = [lemmatizer.lemmatize(i) for i in words]
                    lexicon = list(set(lexicon + words)) # words already in lexicon are not repeatedly added
                    print('after line', counter, 'lexicon now has', len(lexicon), ' words')
                    print('content is ', content)

        except Exception as e:
            print(str(e))

    with open('lexicon.pickle','wb') as f: # write in binary mode
        pickle.dump(lexicon,f)

In [5]:
create_lexicon('train_set.csv') # TODO: the tutorial code is wrong, repeats training data 3 times...

after line 2500 lexicon now has 26  words
content is   @Splont   I have a meeting all morning today if that makes you feel better ? Though I finish tomorrow until next weds to make you hate me!

after line 5000 lexicon now has 32  words
content is   trying to keep my eyes open..damn baking 

after line 7500 lexicon now has 41  words
content is   ahh Juicy fruit gum lose their flavour sooo quickly 

after line 10000 lexicon now has 63  words
content is   CRAP! After looking when I last tweeted... WHY AM I UP SO EARLY. It's 10! I shouldn't be up until like... 12. 

after line 12500 lexicon now has 85  words
content is   @diannabee it's not as painful as b4...but that ugly bruise is still there   and Idk how to do that @ thing on my phone b/c i'm dumb 

after line 15000 lexicon now has 89  words
content is   really miss talking to missy 

after line 17500 lexicon now has 91  words
content is   missing my boyfriend 

after line 20000 lexicon now has 108  words
content is   i hv to READ the

In [None]:
def convert_to_vec(fin,fout,lexicon_pickle):
    with open(lexicon_pickle,'rb') as f:
        lexicon = pickle.load(f)
    outfile = open(fout,'a')
    with open(fin, buffering=20000, encoding='latin-1') as f:
        counter = 0
        for line in f:
            counter +=1
            label = line.split(':::')[0]
            tweet = line.split(':::')[1]
            current_words = word_tokenize(tweet.lower())
            current_words = [lemmatizer.lemmatize(i) for i in current_words]

            features = np.zeros(len(lexicon))

            for word in current_words:
                if word.lower() in lexicon:
                    index_value = lexicon.index(word.lower())
                    # OR DO +=1, test both
                    features[index_value] += 1

            features = list(features)
            outline = str(features)+'::'+str(label)+'\n'
            outfile.write(outline)

        print(counter)

In [None]:
convert_to_vec('test_set.csv','processed-test-set.csv','lexicon-2500-2638.pickle')

In [None]:
def shuffle_data(fin):
    df = pd.read_csv(fin, error_bad_lines=False)
    df = df.iloc[np.random.permutation(len(df))]
    print(df.head())
    df.to_csv('train_set_shuffled.csv', index=False)

In [None]:
shuffle_data('train_set.csv')

In [15]:
def create_test_data_pickle(fin): # what is the purpose of this function... it doesn't ouput anything?

    feature_sets = []
    labels = []
    counter = 0
    with open(fin, buffering=20000) as f:
        for line in f:
            try:
                features = list(eval(line.split(':::')[1])) # shouldn't it be ':::'?
                label = list(eval(line.split(':::')[0]))

                feature_sets.append(features)
                labels.append(label)
                counter += 1
            except:
                pass
    print(counter)
    feature_sets = np.array(feature_sets)
    labels = np.array(labels)

0


In [None]:
create_test_data_pickle('processed-test-set.csv')