## Data importing function

In [45]:
import csv
import codecs
import numpy as np
import pandas as pd
import nltk
import re
from nltk.corpus import wordnet as wn

def load_dataset(path_to_file):
    """Read tweet texts and their integer labels from a CSV file.

    The first line is treated as a header and skipped. For every remaining
    non-empty row, column 1 is parsed as the integer label and column 2 is
    taken as the tweet text.

    Args:
        path_to_file: Path to a comma-separated file that uses '"' as the
            quote character.

    Returns:
        A tuple ``(X, y)`` of two equally long lists: ``X`` holds the text
        column (str) and ``y`` the label column (int), in file order.

    Raises:
        ValueError: If a label cell cannot be converted to ``int``.
        IndexError: If a non-empty row has fewer than three columns.
    """
    X, y = [], []
    # Use the built-in open() with newline='' as the csv module expects.
    # codecs.open() splits lines with str.splitlines(), which also breaks on
    # characters such as U+2028 and would cut a tweet into two bogus rows.
    # errors='ignore' is kept so undecodable bytes are dropped, as before.
    with open(path_to_file, "r", encoding='utf-8', errors='ignore', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader, None)  # Skip header
        for row in reader:
            if not row:
                # csv.reader yields an empty list for blank lines; skip them
                # instead of failing on row[1].
                continue
            y.append(int(row[1]))
            X.append(row[2])
    return X, y

## Loading data set

In [4]:
# Load the raw tweets and labels and turn them into NumPy arrays.
x, y = load_dataset('data/train.csv')
x = np.array(x)
y = np.array(y)

# Draw a reproducible random sample of row positions. The population is the
# number of rows actually loaded (not a hard-coded 100000), so no position can
# point past the end of the data; the sample size is capped accordingly.
sample_rng = np.random.RandomState(23)
indices = sample_rng.choice(len(x), size=min(10000, len(x)), replace=False)

y_series = pd.Series(y, name='labels')
X_series = pd.Series(x, name='data')

# Select by position. The resulting Series keep the original row numbers as
# index labels, so label lookups such as X_train[indices[0]] still work.
X_train = X_series.iloc[indices]
y_train = y_series.iloc[indices]
print(X_train[:50])

64774    @BradtheGleek It's been dead to you? Not here....
43321    @AlexReside  Ooops.. I had a bug   But I fixed...
44030                              @Anistorm Whats wrong? 
45942            @alittletrendy sad  where's your laptop?!
71782    @cakey oh that's good, not bad thanks altho no...
62719    @bikerchick22 what's up hottie!  great to see ...
84527    @Cattington No, not yet,.... lots of wind and ...
51064                @asyeasyeasye haloo too  how are you?
67435    @brookandthecity Lol, oh ok. Just making sure ...
99784    @Crystal_ESPN Ah! OK! Wanted to make sure! Hil...
22086    @_Larissa_ HAHa normally yeah BUT when its PP ...
12277    -&gt; @glynmoody my poor Spanish leaves me com...
70442    @breezyballababe oh awwwww dats how ima feel i...
69633                         @andisherwood I like that.  
83114    @AshPash Awesome job on getting up and doing y...
81036                           @asdfology What about me. 
78646    @cecedesouza If u need help in the 7osa oo loy.

## First 5 rows of data and labels

In [5]:
# Peek at the first five sampled tweets.
first_rows = X_train.head(5)
print(first_rows)

64774    @BradtheGleek It's been dead to you? Not here....
43321    @AlexReside  Ooops.. I had a bug   But I fixed...
44030                              @Anistorm Whats wrong? 
45942            @alittletrendy sad  where's your laptop?!
71782    @cakey oh that's good, not bad thanks altho no...
Name: data, dtype: object


## First solution
Our first approach is to hash every tweet and check whether there is any correlation between the hash value and the tweet's label.

In [33]:
import hashlib


def _sha1_as_int(text):
    """Return the SHA-1 digest of ``text`` (UTF-8 encoded) as a Python int."""
    digest_hex = hashlib.sha1(text.encode('utf-8')).hexdigest()
    return int(digest_hex, 16)


# One integer digest per tweet in x, collected into a NumPy array.
z = np.array([_sha1_as_int(tweet) for tweet in x])

In [34]:
# Show the integer digest of the first tweet as a sanity check.
first_digest = z[0]
print(first_digest)

1021594023517841287183518356032487246803893750680


In [123]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Splitting the hashed tweets into train/test parts.
# The split gets its own names: rebinding X_train / y_train here would
# overwrite the sampled tweet and label Series that the later cleaning
# cells wrap into X_red / y_red.
z_train, z_test, yz_train, yz_test = train_test_split(z, y, test_size=0.2, random_state=23)

# Fit a logistic regression on the single hash feature; reshape(-1, 1) turns
# the 1-D array into the (n_samples, 1) matrix that fit/predict expect.
logreg = LogisticRegression()
logreg.fit(z_train.reshape(-1, 1), yz_train)

y_pred = logreg.predict(z_test.reshape(-1, 1))
# Element-wise flags: True where the prediction matches the true label.
y_diff = y_pred == yz_test

In [36]:
# Accuracy = share of test samples whose prediction matched the label.
correct = sum(1 for hit in y_diff if hit)
print(correct / len(y_diff))

0.5622062206220622


# Second and serious solution

In [23]:
# Work on independent copies of the sampled tweets and labels, so the cleaning
# steps that follow never write through to X_train / y_train.
# (A fixed slice can be used instead of the random sample:
#  pd.Series(x[10000:20000]) and pd.Series(y[10000:20000]).)
X_red = pd.Series(X_train, copy=True)
y_red = pd.Series(y_train, copy=True)

In [24]:
# Sanity check: element type of the first sampled tweet, then the sampled row labels.
first_label = indices[0]
print(type(X_red[first_label]))
print(indices)

<class 'str'>
[64774 43321 44030 ..., 62022 71362 33217]


## Tasks:
- Data cleaning
    - Delete links
    - Delete mentions
    - Remove punctuation
    - All lowercase
    - 

In [25]:
# Try the project-local spell checker on two sample tokens.
# NOTE(review): correction() lives in data/spellcheck and is not visible here;
# presumably it returns the corrected word - confirm against that module.
import data.spellcheck
data.spellcheck.correction('radras')  # value is discarded: not the cell's last expression
data.spellcheck.correction('mi')  # last expression, so the notebook displays its value

'mi'

In [26]:
import html

# Demonstrate html.unescape on a tweet that contains HTML entities.
escaped_tweet = 'haha omg. stayin steezy &amp; mowin the lawn... loviie still here &amp;&amp; goiing tanning soon.'
print(html.unescape(escaped_tweet))

haha omg. stayin steezy & mowin the lawn... loviie still here && goiing tanning soon.


In [28]:
# Removing twitter mentions. Mapping over the whole Series (instead of
# assigning element by element through the global `indices`) cleans exactly
# the tweets X_red holds and binds the result to a new object rather than
# writing into the existing Series.
X_red = X_red.map(lambda tweet: re.sub(r'@\w+', " ", str(tweet)))

# Converting HTML codes to plain text
X_red = X_red.map(html.unescape)

# Show the first ten cleaned tweets.
for tweet in X_red.iloc[:10]:
    print(tweet)


 It's been dead to you? Not here. My friend tried making an account, and when he did, 2 minutes later his page didn't exist 
  Ooops.. I had a bug   But I fixed it..    Try that tiny URL again..
 Whats wrong? 
 sad  where's your laptop?!
 oh that's good, not bad thanks altho not wanting to go to work 
 what's up hottie!  great to see you tweetin in!!! 
 No, not yet,.... lots of wind and rain and cold  How are you my friend?
 haloo too  how are you?
 Lol, oh ok. Just making sure i was still in tuned wit dat good ol' ol skool muzik! 
 Ah! OK! Wanted to make sure! Hilarious. Yeah, she's the homie alright. But she better watch out. I know way too much, 


In [29]:
# Run the project spell checker on the misspelt 'korrekt' and look up the
# WordNet synsets of whatever it returns (displayed as the cell's output).
wn.synsets(data.spellcheck.correction('korrekt'))

[Synset('correct.v.01'),
 Synset('right.v.01'),
 Synset('chastise.v.01'),
 Synset('compensate.v.01'),
 Synset('discipline.v.02'),
 Synset('decline.v.06'),
 Synset('adjust.v.01'),
 Synset('correct.v.08'),
 Synset('correct.a.01'),
 Synset('correct.s.02'),
 Synset('correct.s.03'),
 Synset('right.a.05')]

In [30]:
from nltk.tokenize import sent_tokenize, regexp_tokenize
from nltk.corpus import stopwords
# NOTE(review): the star import hides where names come from; this cell relies
# on it (presumably) providing remove_more_than_two_duplicate_letters.
from data.utils import *
import autocorrect

# Tokenize every tweet into a list of words (runs of word characters and
# apostrophes). The pattern is a raw string so that \w is not parsed as a
# string escape sequence.
all_words = [regexp_tokenize(red, r"[\w']+") for red in X_red]

# Lower-case each token and pass it through the project helper.
all_words = [
    [remove_more_than_two_duplicate_letters(word.lower()) for word in words]
    for words in all_words
]
print("finished")

# Load the stop-word list once; a set makes each membership test cheap
# instead of re-reading and scanning the list for every single word.
english_stopwords = stopwords.words('english')
stopword_set = set(english_stopwords)

# clean the token lists from numeric values and stop words
all_words = [
    [word for word in words
     if not word.isnumeric() and word not in stopword_set]
    for words in all_words
]

# Show the tokens of the first ten tweets (fewer if the sample is smaller).
for words in all_words[:10]:
    print(words)

print()
print('LEN:', len(english_stopwords))
print(english_stopwords)


# Example raw tweets kept for reference:
#.. Omgaga. Im sooo  im gunna CRy. 
#I've been at this dentist since 11.. I was suposed 2 just get a crown put on (30mins)...

finished
['dead', 'friend', 'tried', 'making', 'account', 'minutes', 'later', 'page', 'exist']
['oops', 'bug', 'fixed', 'try', 'tiny', 'url']
['whats', 'wrong']
['sad', "where's", 'laptop']
['oh', "that's", 'good', 'bad', 'thanks', 'altho', 'wanting', 'go', 'work']
["what's", 'hottie', 'great', 'see', 'tweetin']
['yet', 'lots', 'wind', 'rain', 'cold', 'friend']
['haloo']
['lol', 'oh', 'ok', 'making', 'sure', 'still', 'tuned', 'wit', 'dat', 'good', "ol'", 'ol', 'skool', 'muzik']
['ah', 'ok', 'wanted', 'make', 'sure', 'hilarious', 'yeah', 'homie', 'alright', 'better', 'watch', 'know', 'way', 'much']

LEN: 179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am'

In [31]:
from nltk.stem import PorterStemmer

# Reduce every token to its Porter stem, rewriting each tweet's word list in place.
porter = PorterStemmer()
for words in all_words:
    words[:] = [porter.stem(word) for word in words]

# Show the stemmed tokens of the first ten tweets.
for position in range(10):
    print(all_words[position])

['dead', 'friend', 'tri', 'make', 'account', 'minut', 'later', 'page', 'exist']
['oop', 'bug', 'fix', 'tri', 'tini', 'url']
['what', 'wrong']
['sad', "where'", 'laptop']
['oh', "that'", 'good', 'bad', 'thank', 'altho', 'want', 'go', 'work']
["what'", 'hotti', 'great', 'see', 'tweetin']
['yet', 'lot', 'wind', 'rain', 'cold', 'friend']
['haloo']
['lol', 'oh', 'ok', 'make', 'sure', 'still', 'tune', 'wit', 'dat', 'good', "ol'", 'ol', 'skool', 'muzik']
['ah', 'ok', 'want', 'make', 'sure', 'hilari', 'yeah', 'homi', 'alright', 'better', 'watch', 'know', 'way', 'much']


In [32]:
# Glue each tweet's tokens back into one space-separated string.
allz_words = [" ".join(tokens) for tokens in all_words]

print(allz_words[:5])

['dead friend tri make account minut later page exist', 'oop bug fix tri tini url', 'what wrong', "sad where' laptop", "oh that' good bad thank altho want go work"]


In [43]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=5000)

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of
# strings.
train_data_features = vectorizer.fit_transform(allz_words)

# Numpy arrays are easy to work with, so convert the result to an
# array
train_data_features = train_data_features.toarray()

# Collect the labels in the same order as the tweets in X_red (the Series the
# token strings were built from). Looking them up through X_red's own index,
# rather than through the global `indices`, keeps features and labels aligned
# even if `indices` is re-sampled elsewhere in the notebook.
y_nov = y_red.loc[X_red.index].tolist()

assert len(y_nov) == train_data_features.shape[0], \
    "number of labels does not match number of feature rows"

In [44]:
from sklearn.linear_model import LogisticRegression

# Hold out 20% of the bag-of-words rows for testing (fixed seed).
split_parts = train_test_split(train_data_features, y_nov, test_size=0.2, random_state=23)
X_train, X_test, y_train, y_test = split_parts

# Fit the classifier (fit returns the estimator itself) and predict on the held-out rows.
logreg = LogisticRegression().fit(X_train, y_train)
y_pred = logreg.predict(X_test)

# Element-wise flags: True where the prediction matches the true label.
y_diff = (y_pred == y_test)

print(len(train_data_features))

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [119]:
# Accuracy = share of test samples whose prediction matched the label.
correct = sum(1 for hit in y_diff if hit)
print(correct / len(y_diff))

0.625


In [66]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours baseline (k = 101) on the same train/test split.
classifier = KNeighborsClassifier(n_neighbors=101)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
# Element-wise flags: True where the prediction matches the true label.
y_diff = (y_pred == y_test)

# Accuracy = share of test samples whose prediction matched the label.
correct = sum(1 for hit in y_diff if hit)
print(correct / len(y_diff))


0.636


In [73]:
import numpy as np

# Draw a fresh, reproducible sample of (up to) 1000 row positions. The
# population is the number of rows actually held in x rather than a
# hard-coded 100000, so every sampled position refers to an existing row.
resample_rng = np.random.RandomState(23)
indices = resample_rng.choice(len(x), size=min(1000, len(x)), replace=False)

y_series = pd.Series(y)
X_series = pd.Series(x)

# Select by position. NOTE(review): the recorded output of the label-based
# version of this cell was all NaN, which suggests the lookup did not find
# the sampled labels - confirm what x held at that point.
X_train = X_series.iloc[indices]
print(X_train[:50])


21766   NaN
31167   NaN
97526   NaN
37538   NaN
58234   NaN
22041   NaN
22256   NaN
29630   NaN
72726   NaN
53980   NaN
55329   NaN
39064   NaN
29649   NaN
25675   NaN
20126   NaN
59010   NaN
53061   NaN
14594   NaN
53302   NaN
83047   NaN
13809   NaN
62194   NaN
73472   NaN
32456   NaN
16359   NaN
74733   NaN
11300   NaN
12952   NaN
21527   NaN
22394   NaN
56750   NaN
40941   NaN
76221   NaN
88124   NaN
8948    NaN
22664   NaN
69776   NaN
71499   NaN
19327   NaN
65972   NaN
10189   NaN
48833   NaN
78504   NaN
8345    NaN
59089   NaN
52461   NaN
29548   NaN
32516   NaN
72157   NaN
26261   NaN
dtype: float64
