In [2]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from collections import Counter
import re
import string
from numpy import loadtxt

# Fetch the NLTK resources used by this notebook. A package that is already
# present locally is reported as up to date instead of being fetched again.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
# Required by stopwords.words('english') in the stop-word cell; without this
# corpus a fresh environment fails with LookupError when that cell runs.
nltk.download('stopwords')

# Display up to 100 characters per cell when showing text columns.
pd.set_option('display.max_colwidth', 100)

# Shared WordNet lemmatizer instance used by lemmatization().
wn = nltk.WordNetLemmatizer()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/teframartin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/teframartin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/teframartin/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
# Stop words: words so common that there is no need to use them in a search.
# Start from NLTK's English list, then append a few extra entries.
ENGLISH_STOP_WORDS = list(stopwords.words('english'))
ENGLISH_STOP_WORDS.extend(
    ['im', 'dont', 'dunno', 'cant', "'s", 'u', 'x', 'user', 'url', 'rt', 'lol', '<user>', '<url>']
)

In [4]:
# Find the most frequently used words in a text (e.g. all tweets joined together).
def get_most_common_words(txt, limit):
    """Return the `limit` most frequent whitespace-separated words of `txt`.

    Args:
        txt: Text to analyse; it is split on whitespace.
        limit: Upper bound of the slice taken from the ranking.

    Returns:
        A list of ``(word, count)`` pairs ordered from most to least frequent.
    """
    frequencies = Counter(txt.split())
    ranking = frequencies.most_common()
    return ranking[:limit]

# Remove punctuation and stop words (words so common that there is no need to
# use them in a search) from a tweet.
def clean_tweet(tweet):
    """Tokenize a tweet after stripping punctuation and drop stop words.

    Args:
        tweet: Raw tweet text. A non-string value (for example the NaN pandas
            uses for a missing cell) is treated as an empty tweet.

    Returns:
        A list of the remaining tokens, in their original order. Empty tokens
        are never returned.
    """
    if not isinstance(tweet, str):
        return []
    # Drop every ASCII punctuation character before tokenizing.
    no_punct = "".join(ch for ch in tweet if ch not in string.punctuation)
    # Raw string: '\W' in a normal string literal is an invalid escape sequence.
    tokens = re.split(r'\W+', no_punct)
    # re.split yields '' at the edges when the text starts/ends with a
    # separator; filter those out together with the stop words.
    return [word for word in tokens if word and word not in ENGLISH_STOP_WORDS]

# Replace every token by the lemma returned by the shared WordNet lemmatizer so
# that words of the same family map to a common form.
# NOTE(review): lemmatize() is called without a part-of-speech argument;
# confirm that verb forms (changing/changed ...) are reduced as intended.
def lemmatization(token_tweet):
    """Lemmatize each token of an already tokenized tweet.

    Args:
        token_tweet: Iterable of word tokens.

    Returns:
        A new list holding ``wn.lemmatize(token)`` for every token, in order.
    """
    lemmas = []
    for token in token_tweet:
        lemmas.append(wn.lemmatize(token))
    return lemmas

# Join a tokenized tweet back into a single text, as it was at the beginning.
def concatenate(lst):
    """Join tokens into one space-separated string.

    Uses ``str.join`` instead of repeated string concatenation, which avoids
    quadratic copying and the stray leading space the manual loop produced.

    Args:
        lst: Iterable of string tokens.

    Returns:
        The tokens separated by single spaces; ``''`` for an empty input.
    """
    return ' '.join(lst)

def remove_digit(txt):
    """Return `txt` with every character for which ``str.isdigit`` is true removed."""
    kept_chars = []
    for ch in txt:
        if ch.isdigit():
            continue
        kept_chars.append(ch)
    return ''.join(kept_chars)


def clean_data(df):
    """Run the text-cleaning pipeline on the ``text`` column of `df`.

    The stages are applied one after another: clean_tweet (punctuation and
    stop-word removal), lemmatization, then concatenate (tokens back to a
    string). A progress message is printed after each stage.

    The ``text`` column is overwritten on the frame that was passed in, and
    that same frame object is returned.

    A second clean_tweet/concatenate pass and a remove_digit pass used to be
    sketched here as commented-out code; they are not part of the pipeline.
    """
    print("Inside clean_data")
    stages = (
        (clean_tweet, "Clean_tweet DONE"),
        (lemmatization, "Lemmatization DONE"),
        (concatenate, "Concatenate DONE"),
    )
    for stage_fn, done_message in stages:
        df['text'] = df['text'].apply(stage_fn)
        print(done_message)
    return df

# Export a DataFrame to a CSV file.
def export_to_csv(df, filename):
    """Write `df` to `filename` as CSV, leaving out the index column.

    Args:
        df: DataFrame to serialise.
        filename: Destination handed to ``DataFrame.to_csv``.
    """
    df.to_csv(path_or_buf=filename, index=False)

In [8]:
DATA_FOLDER = "../data/"

POSITIVE_DATASET = DATA_FOLDER+"train_pos.txt"
NEGATIVE_DATASET = DATA_FOLDER+"train_neg.txt"

TEST_DATASET = DATA_FOLDER+"test_data.txt"


def _read_tweet_lines(path):
    """Return the non-blank lines of a text file, stripped of surrounding whitespace.

    Plain line reading replaces pd.read_fwf here: read_fwf infers the column
    width from the first rows only, so longer lines were silently truncated,
    and NA-like lines were converted to NaN.
    """
    # assumes the dataset files are UTF-8 encoded — TODO confirm
    with open(path, encoding="utf-8") as handle:
        stripped = [line.strip() for line in handle]
    return [line for line in stripped if line]


def _read_test_tweets(path):
    """Return the text part of every ``<id>,<tweet>`` line of the test file.

    Only the first comma is treated as a separator, so commas and '#'
    characters inside the tweet are preserved (numpy.loadtxt cut the tweet at
    the next comma and treated '#' as the start of a comment). A line without
    any comma yields an empty string.
    """
    tweets = []
    for line in _read_tweet_lines(path):
        _, _, text = line.partition(",")
        tweets.append(text)
    return tweets


# Training tweets: one per line, labelled 1 (positive) or 0 (negative).
pos_data = pd.DataFrame({"text": _read_tweet_lines(POSITIVE_DATASET)}).drop_duplicates()
pos_data["labels"] = 1
neg_data = pd.DataFrame({"text": _read_tweet_lines(NEGATIVE_DATASET)}).drop_duplicates()
neg_data["labels"] = 0


# NOTE(review): drop_duplicates() also removes rows from the test set; confirm
# that downstream code re-aligns predictions using the preserved index.
df_test = _read_test_tweets(TEST_DATASET)
test_data = pd.DataFrame(df_test, columns=['text']).drop_duplicates()

# train_data = pd.concat([pos_data, neg_data], ignore_index=True)

# clean_data() overwrites the 'text' column of the frame it receives, so each
# *_cleaned name refers to the same object as its input frame.
pos_data_cleaned = clean_data(pos_data)
neg_data_cleaned = clean_data(neg_data)

test_data_cleaned = clean_data(test_data)

Inside clean_data
Clean_tweet DONE
Lemmatization DONE
Concatenate DONE
Inside clean_data
Clean_tweet DONE
Lemmatization DONE
Concatenate DONE


In [10]:
# Export the cleaned frames to CSV files in the data folder.
# NOTE(review): unlike export_to_csv(), these calls do not pass index=False, so
# the DataFrame index is written as the first column — confirm this is intended.
for cleaned_frame, csv_path in (
    (pos_data_cleaned, '../data/pos_data_cleaned.csv'),
    (neg_data_cleaned, '../data/neg_data_cleaned.csv'),
    (test_data_cleaned, '../data/test_data_cleaned.csv'),
):
    cleaned_frame.to_csv(csv_path)