### Import libraries

In [None]:
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
import copy

### Read in datasets

In [None]:
# Load both splits; the test 'Id' column is read with dtype object rather than inferred.
train, test = (
    pd.read_csv("train.csv"),
    pd.read_csv("test.csv", dtype={'Id': object}),
)

In [None]:
# Peek at the first five training rows.
train.head(n=5)

In [None]:
# Peek at the first five test rows.
test.head(n=5)

### Basic statistics

In [None]:
# Number of rows in the training set.
train.shape[0]

In [None]:
# Number of rows in the test set.
test.shape[0]

There are a lot of empty rows in the test dataset. Let's remove them.

In [None]:
# Discard every test row that contains at least one missing value.
test = test.dropna(axis=0, how='any')

In [None]:
# Row count of the test set after removing rows with missing values.
test.shape[0]

There are a lot of "Not Available" comments in both sets. We remove them from the training set. "Not Available" comments in the test dataset will be assigned to the majority class observed among the "Not Available" comments in the training set.

In [None]:
# Set the placeholder tweets aside first, then keep only the remaining rows in train.
train_na = train.loc[train['Tweet'].eq('Not Available')]
train = train.loc[train['Tweet'].ne('Not Available')]

In [None]:
# Row count of the training set once 'Not Available' tweets are gone.
train.shape[0]

In [None]:
# Class distribution of the 'Not Available' training tweets (count of matching rows per label).
print("NA positive", int((train_na['Category'] == "positive").sum()))
print("NA neutral", int((train_na['Category'] == "neutral").sum()))
print("NA negative", int((train_na['Category'] == "negative").sum()))

Most of the "Not Available" comments are positive, so "Not Available" test comments will be assigned to this class.

In [None]:
# How many training rows fall into each Category value.
train.Category.value_counts()

In [None]:
# Remove rows whose Category is the literal string 'Tweet'
# (presumably stray header-like rows - TODO confirm), then re-check the class balance.
train = train.loc[train['Category'].ne('Tweet')]
train.Category.value_counts()

### Comments cleaning

In [None]:
def remove_unwanted(wordlist):
    '''Strip @-tags and links from wordlist, in place.

    A token is discarded when it begins with '@', 'http' or 'www'.
    The list that was passed in is modified directly and that very
    same list object is returned.
    '''
    blocked_prefixes = ('@', 'http', 'www')
    # Slice assignment replaces the contents while keeping the caller's list object.
    wordlist[:] = [token for token in wordlist if not token.startswith(blocked_prefixes)]
    return wordlist

def clean_comments(train_dataset):
    '''Clean the tweets of a dataset and drop rows that end up empty.

    The tokenise / filter / join steps are delegated to ``clean_test`` so
    that both functions always apply exactly the same cleaning. The result
    is a new frame with the extra columns 'splited', 'cleaned' and
    'cleaned_string'; ``train_dataset`` itself does not get new columns.

    Rows whose 'cleaned_string' is the empty string (the tweet contained
    nothing but @-tags and links) are removed afterwards.

    Prints "Copied", "Splited", "Cleaned", "Joined" (from ``clean_test``)
    and finally "Deleted empties".
    '''
    # Shared pipeline: copy, split on whitespace, remove tags/links, join back.
    train_cleaned = clean_test(train_dataset)

    # Keep only rows that still have text. A boolean mask is used instead of
    # drop(index=...) so that rows sharing an index label with an empty row
    # are not removed by accident.
    has_text = train_cleaned['cleaned_string'] != ''
    train_cleaned = train_cleaned[has_text]
    print("Deleted empties")

    return train_cleaned

def clean_test(train_dataset):
    '''Clean the tweets of a dataset without removing any row.

    Works on a copy of ``train_dataset``, whose 'Tweet' column must hold
    strings, and adds three columns:

    * 'splited'        - the tweet split on whitespace,
    * 'cleaned'        - the token list returned by ``remove_unwanted``,
    * 'cleaned_string' - the cleaned tokens joined with single spaces.

    Every input row is present in the returned frame, even when its
    'cleaned_string' is empty. Prints a progress message after each stage.
    '''
    train_cleaned = train_dataset.copy()
    print("Copied")

    # split text into list of words
    train_cleaned['splited'] = train_cleaned['Tweet'].apply(lambda tweet: tweet.split())
    print("Splited")

    # Cleaning words from links and tags
    train_cleaned['cleaned'] = train_cleaned['splited'].apply(remove_unwanted)
    print("Cleaned")

    # Merging the cleaned words back into a string. The join reads the value
    # returned by remove_unwanted ('cleaned') so the result does not depend on
    # that helper also having modified the 'splited' lists in place.
    train_cleaned['cleaned_string'] = train_cleaned['cleaned'].apply(' '.join)
    print("Joined")

    return train_cleaned

In [None]:
def export_frame(dataframe, filename):
    '''Write dataframe to filename as a ';'-separated UTF-8 CSV.

    The index is not written and floats are formatted without decimals.
    '''
    csv_options = dict(sep=';', encoding='utf-8', index=False, float_format='%.0f')
    dataframe.to_csv(filename, **csv_options)

In [None]:
# Training tweets that become empty after cleaning are dropped by clean_comments.
# The test set goes through clean_test instead, which never removes rows, so no
# test Id disappears from the exported file.
train = clean_comments(train)
test = clean_test(test)

In [None]:
# First five training rows, now including the columns added by cleaning.
train.head(n=5)

In [None]:
# Replace the sentiment labels in the 'Category' column with integer codes.
mapping = dict(neutral=0, positive=1, negative=2)
train = train.replace(to_replace={'Category': mapping})
test = test.replace(to_replace={'Category': mapping})

In [None]:
# Keep only the columns that are written to the cleaned CSV files.
train_exp = train.loc[:, ['Id', 'Category', 'cleaned_string']]
test_exp = test.loc[:, ['Id', 'cleaned_string']]

In [None]:
# Preview of the training frame that will be exported.
train_exp.head(n=5)

In [None]:
# Preview of the test frame that will be exported.
test_exp.head(n=5)

In [None]:
#test_exp.Id = test_exp.Id.astype(object, copy=False)

In [None]:
# Write both cleaned frames to disk.
export_frame(dataframe=train_exp, filename="train_clean.csv")
export_frame(dataframe=test_exp, filename="test_clean.csv")

In [None]:
# Show the dtype of each exported training column.
train_exp.dtypes