In [95]:
import pandas as pd
import re
import functions
import nltk
import sklearn
from sklearn.linear_model import LogisticRegression
from nltk.stem import PorterStemmer

In [2]:
data = pd.read_csv("train_posts.csv", header=None)

In [96]:
porter = PorterStemmer()

In [52]:
imax = 30000
imin = 25000

In [126]:
preprocessed = []
repeat = 0
for i, tweet in enumerate(data[0].values[:imax]):
    tmp = tweet.lower()
    tmp = re.sub("[0-9]{1,2}/[0-9]{1,2}/([0-9]{4}|[0-9]{2})|([0-9]{4}|[0-9]{2})/[0-9]{1,2}/[0-9]{1,2}|[0-9]{2}/[0-9]{2}", "DATE", tmp)
    tmp = re.sub("[0-9]{2}(:[0-9]{2}){1,2}( ?(am|pm|p\.m\.|a\.m\.))?", "TIME", tmp)
    tmp = re.sub("[0-9]+", " nb ", tmp)
    tmp = tmp.encode(encoding='ascii', errors='ignore').decode()
    # get rid of the non ascii characters
    tmp = re.sub("\.{2,}", "...", tmp)
    for c in ['!', '-', ',']:
        tmp = re.sub("{}+".format(c), c, tmp)
    tmp = re.sub("\?+", "?", tmp)
    tmp = re.sub("`+", "'", tmp)
    tmp = re.sub("'{2,}", "'", tmp)
    tmp = tmp.translate(str.maketrans(dict.fromkeys('#*+/<=>@[\\]^_`{|}~'))) #removing all the other special characters
    tokens = [t if t not in ["''", "``"] else '"' for t in nltk.tokenize.word_tokenize(tmp) ]
    # tokenizing using nltk.word_tokenize. althought it transforms '"' into '``' or "''" and this is a behaviour do not want
    # so we make sure that the '"' are changed to their original form
    for i, token in enumerate(tokens):
        if re.search(r"([a-z])\1{2,}", token):
            repeat += 1
            # cleaning the words containing a letter repeated 3 times or more, using the list of the Ensglish words
            tokens[i] = functions.clean(token)
        tokens[i] = porter.stem(tokens[i])
    preprocessed.append(tokens)
print("{} words with repetitions cleaned".format(repeat))

6566 words with repetitions cleaned


In [134]:
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(min_df=0.0001, stop_words='english', ngram_range=(1,2))

In [135]:
X = vectorizer.fit_transform([' '.join(post) for post in preprocessed[:]])
Y_train = data[1].values[:imin]
#X_test = vectorizer.transform([' '.join(post) for post in preprocessed[imin:]])
X_train = X[:imin]
X_test = X[imin:]
Y_test =  data[1].values[imin:imax]

In [136]:
len(vectorizer.get_feature_names())

149526

In [137]:
clf = LogisticRegression(C=5,class_weight='balanced', solver='newton-cg',multi_class='multinomial', n_jobs=-1,\
                         random_state=40, verbose=1, penalty='l2')
clf.fit(X_train, Y_train)    

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    8.9s finished


LogisticRegression(C=5, class_weight='balanced', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=-1, penalty='l2',
                   random_state=40, solver='newton-cg', tol=0.0001, verbose=1,
                   warm_start=False)

In [138]:
clf.score(X_train,Y_train)

0.98916

In [139]:
clf.score(X_test, Y_test)

0.632

In [133]:
print(vectorizer.get_feature_names())

['aa', 'aa meet', 'aac', 'aag', 'aah', 'aahh', 'aan', 'aar', 'aardvark', 'aaron', 'aaron carter', 'aaron sorkin', 'aaron spell', 'aaww', 'aay', 'aaya', 'ab', 'aba', 'aback', 'abacu', 'abad', 'abandon', 'abandon issu', 'abandon thi', 'abb', 'abba', 'abbey', 'abbi', 'abbi got', 'abbi wa', 'abbott', 'abbrevi', 'abby', 'abc', 'abc affili', 'abc nb', 'abc news', 'abc onli', 'abck', 'abd', 'abdomen', 'abdomin', 'abduct', 'abdul', 'abe', 'abercrombi', 'abercrombi chick', 'abercrombi fitch', 'aberdeen', 'aberr', 'abhi', 'abhor', 'abhorr', 'abi', 'abi friend', 'abi gavin', 'abi nb', 'abid', 'abil', 'abil accept', 'abil ask', 'abil commun', 'abil connect', 'abil creat', 'abil develop', 'abil good', 'abil make', 'abil peopl', 'abil play', 'abil read', 'abil realli', 'abil speak', 'abil tell', 'abil use', 'abil win', 'abil work', 'abit', 'abject', 'abl', 'abl access', 'abl accomplish', 'abl achiev', 'abl add', 'abl afford', 'abl ani', 'abl anoth', 'abl answer', 'abl anyth', 'abl appreci', 'abl ask