In [1]:
import numpy as np
import pandas as pd
import string
#import xgboost as xgb
import io
import nltk
import matplotlib
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
nltk.download('stopwords')
stopwords = stopwords.words('english')
stemmer = SnowballStemmer('english')

from textblob import TextBlob
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/matiascano/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matiascano/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/matiascano/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/matiascano/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
sia = SentimentIntensityAnalyzer()
def return_sia_compound_values(text):
    return sia.polarity_scores(text)['compound']

In [None]:
def remove_stopword(text):
    new_text = []
    for e in text:
        if e not in stopwords and e.isalpha():
            new_text.append(e)
    text = new_text
    return " ".join(new_text)

def stemm(text):
    text = [stemmer.stem(word) for word in text.split()]
    return " ".join(text)

def contains_punctuation(text):
    punctuation = set(string.punctuation)
    for character in text:
        if character in punctuation:
            return True
    return False

def amount_of_punctuation(text):
    punctuation = set(string.punctuation)
    amount = 0
    for character in text:
        if character in punctuation: amount += 1
    return amount

def get_adjectives(text):
    blob = TextBlob(text)
    return len([word for (word,tag) in blob.tags if tag.startswith("JJ")])

def get_nouns(text):
    blob = TextBlob(text)
    return len([word for (word,tag) in blob.tags if tag.startswith("NN")])

def get_verbs(text):
    blob = TextBlob(text)
    return len([word for (word,tag) in blob.tags if tag.startswith("VB")])

def get_adverbs(text):
    blob = TextBlob(text)
    return len([word for (word,tag) in blob.tags if tag.startswith("RB")])

In [7]:
tweets = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
tweets['keyword'] = tweets.keyword.str.replace('%20',' ')

In [8]:
tweets.drop_duplicates(subset = 'text', keep = False, inplace = True)
tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7434 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7434 non-null   int64 
 1   keyword   7378 non-null   object
 2   location  4982 non-null   object
 3   text      7434 non-null   object
 4   target    7434 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 348.5+ KB


In [9]:
test['text'].duplicated().value_counts()

False    3243
True       20
Name: text, dtype: int64

### Feature Engineering

In [None]:
tweets_metrics = tweets[['id','text','target']]
tweets_metrics['text_without_stopwords'] = tweets_metrics['text'].str.split()
tweets_metrics['text_without_stopwords'] = tweets_metrics['text_without_stopwords'].apply(remove_stopword)

tweets_metrics['length'] = tweets_metrics['text'].apply(lambda x: len(x))
tweets_metrics['avg_word_length'] = tweets_metrics['text'].str.split().apply(lambda x: [len(y) for y in x]).transform(lambda x: np.mean(x))
tweets_metrics['amount_of_words'] = tweets_metrics['text'].str.split().transform(lambda x: len(x))
unique_words_by_tweet = tweets_metrics['text'].transform(lambda x: x.split()).transform(lambda x: pd.Series(x).unique()).transform(lambda x: len(x))
tweets_metrics['amount_of_unique_words'] = unique_words_by_tweet
tweets_metrics['sentiment'] = tweets_metrics['text'].apply(lambda x: return_sia_compound_values(x))
tweets_metrics['stopwords_count'] = tweets_metrics['text'].apply(lambda x: len([word for word in str(x).lower().split() if word in stopwords]))
tweets_metrics['punctuation_count'] = tweets_metrics['text'].apply(lambda x: amount_of_punctuation(x))
mentions = tweets_metrics['text'].str.findall(r'@.\S*?(?=\s|[:]|$)').to_frame()
tweets_metrics['mentions_count'] = mentions['text'].apply(lambda x: len(x))
hashtags = tweets_metrics['text'].str.findall(r'#[^?\s].*?(?=\s|$)')
tweets_metrics['hashtags_count'] = hashtags.apply(lambda x: len(x))
tweets_metrics['longest_word_length_without_stopwords'] = tweets_metrics['text_without_stopwords'].apply(lambda x: ([len(word) for word in str(x).lower().split() if not word.startswith('http')])).apply(lambda x: max(x) if len(x) > 0 else 0)
tweets_metrics['stopword_word_ratio'] = tweets_metrics['stopwords_count'] / tweets_metrics['amount_of_words']

tweets_metrics['adjectives_count'] = tweets_metrics['text'].apply(get_adjectives)
tweets_metrics['nouns_count'] = tweets_metrics['text'].apply(get_nouns)
tweets_metrics['verbs_count'] = tweets_metrics['text'].apply(get_verbs)
tweets_metrics['adverbs_count'] = tweets_metrics['text'].apply(get_adverbs)

In [None]:
tweets_metrics.to_csv('train_features.csv', index=False)

In [None]:
test_metrics = test[['id','text']]
test_metrics['text_without_stopwords'] = test_metrics['text'].str.split()
test_metrics['text_without_stopwords'] = test_metrics['text_without_stopwords'].apply(remove_stopword)

test_metrics['length'] = test_metrics['text'].apply(lambda x: len(x))
test_metrics['avg_word_length'] = test_metrics['text'].str.split().apply(lambda x: [len(y) for y in x]).transform(lambda x: np.mean(x))
test_metrics['amount_of_words'] = test_metrics['text'].str.split().transform(lambda x: len(x))
unique_words_by_tweet = test_metrics['text'].transform(lambda x: x.split()).transform(lambda x: pd.Series(x).unique()).transform(lambda x: len(x))
test_metrics['amount_of_unique_words'] = unique_words_by_tweet
test_metrics['sentiment'] = test_metrics['text'].apply(lambda x: return_sia_compound_values(x))
test_metrics['stopwords_count'] = test_metrics['text'].apply(lambda x: len([word for word in str(x).lower().split() if word in stopwords]))
test_metrics['punctuation_count'] = test_metrics['text'].apply(lambda x: amount_of_punctuation(x))
mentions = test_metrics['text'].str.findall(r'@.\S*?(?=\s|[:]|$)').to_frame()
test_metrics['mentions_count'] = mentions['text'].apply(lambda x: len(x))
hashtags = test_metrics['text'].str.findall(r'#[^?\s].*?(?=\s|$)')
test_metrics['hashtags_count'] = hashtags.apply(lambda x: len(x))
test_metrics['longest_word_length_without_stopwords'] = test_metrics['text_without_stopwords'].apply(lambda x: ([len(word) for word in str(x).lower().split() if not word.startswith('http')])).apply(lambda x: max(x) if len(x) > 0 else 0)
test_metrics['stopword_word_ratio'] = test_metrics['stopwords_count'] / test_metrics['amount_of_words']

test_metrics['adjectives_count'] = test_metrics['text'].apply(get_adjectives)
test_metrics['nouns_count'] = test_metrics['text'].apply(get_nouns)
test_metrics['verbs_count'] = test_metrics['text'].apply(get_verbs)
test_metrics['adverbs_count'] = test_metrics['text'].apply(get_adverbs)

In [None]:
test.to_csv('test_features.csv', index=False)

### Word2Vec

#### Keywords

In [None]:
tweets['keyword'] = tweets['keyword'].fillna('NULL')
test['keyword'] = test['keyword'].fillna('NULL')

In [None]:
keywords = tweets[['keyword', 'id']]
keywords.head()

In [None]:
keyword_tokens = keywords.keyword.unique().tolist()

In [None]:
keyword_tokens[0]

In [None]:
len(keyword_tokens)

In [None]:
from gensim.models import Word2Vec

# Parameters
# sg ({0, 1}, optional) - Training algorithm: 1 for skip-gram; otherwise CBOW.

keyword_vectors = Word2Vec([keyword_tokens], min_count=1, size= 100, workers=3, window =3, sg=1)

In [None]:
# Sanity check
keyword_vectors['ablaze']

In [None]:
to_vector_matrix = {}

for k in keyword_tokens:
    to_vector_matrix[k] = keyword_vectors[k]

In [None]:
keyword_w2v = pd.DataFrame.from_dict(to_vector_matrix).T.reset_index()
keyword_w2v

In [None]:
aux = []
aux.append('keyword')
for i in range (0, 100):
    name = 'v' + str(i)
    aux.append(name)
len(aux)

In [None]:
keyword_w2v.columns = aux

In [None]:
keyword_w2v.head()

In [None]:
keyword_w2v.to_csv('keyword_features.csv', index=False)

#### Locations

In [19]:
locations = pd.read_csv("../TP1/locations.csv", usecols=['location', 'address'])

In [20]:
locations.head()

Unnamed: 0,location,address
0,,
1,glasgow,"Glasgow, Glasgow City, Scotland, G2 9SA, Unite..."
2,"melbourne, australia","City of Melbourne, Victoria, Australia"
3,news,"34375, Abbotsford Centre, Abbotsford, Fraser V..."
4,alberta,"Alberta, Canada"


In [21]:
locations.loc[0, 'location'] = 'NULL'

In [22]:
locations

Unnamed: 0,location,address
0,,
1,glasgow,"Glasgow, Glasgow City, Scotland, G2 9SA, Unite..."
2,"melbourne, australia","City of Melbourne, Victoria, Australia"
3,news,"34375, Abbotsford Centre, Abbotsford, Fraser V..."
4,alberta,"Alberta, Canada"
...,...,...
2266,zac newsome loves me,
2267,"zeerust, south africa","Zeerust, Ngaka Modiri Molema District Municipa..."
2268,zero branco,"Zero Branco, Treviso, Veneto, 31059, Italia"
2269,ziam af,


In [23]:
for_location = tweets[['id', 'location']]

In [24]:
for_location['location'] = for_location['location'].map(lambda x: x if x is np.nan else x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [25]:
for_location.fillna("NULL", inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


In [26]:
import re 
for_location['location'] = for_location['location'].map(lambda x: x if re.match(r'^([a-zA-Z,\s])*$', x) else 'NULL')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [28]:
to_vectorize = for_location.merge(locations).loc[:, ['id', 'address']]

In [39]:
to_vectorize.fillna('NULL', inplace=True)

In [40]:
to_vectorize['address'] = to_vectorize['address'].str.replace('\d+', '') # No le gustan los numeros

In [82]:
to_vectorize.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7433 entries, 0 to 7432
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       7433 non-null   int64 
 1   address  7433 non-null   object
dtypes: int64(1), object(1)
memory usage: 174.2+ KB


In [43]:
from tensorflow import keras
from keras.preprocessing.text import Tokenizer

t = Tokenizer()
t.fit_on_texts(to_vectorize['address'])
vocab_size = len(t.word_index) + 1

In [67]:
t.word_index

{'null': 1,
 'united': 2,
 'of': 3,
 'states': 4,
 'america': 5,
 'county': 6,
 'england': 7,
 'kingdom': 8,
 'new': 9,
 'york': 10,
 'canada': 11,
 'california': 12,
 'london': 13,
 'city': 14,
 'north': 15,
 'west': 16,
 'south': 17,
 'texas': 18,
 'greater': 19,
 'district': 20,
 'de': 21,
 'san': 22,
 'australia': 23,
 'los': 24,
 'angeles': 25,
 'east': 26,
 'india': 27,
 'florida': 28,
 'washington': 29,
 'ontario': 30,
 'and': 31,
 'columbia': 32,
 'carolina': 33,
 'swa': 34,
 'dx': 35,
 'the': 36,
 'francisco': 37,
 'illinois': 38,
 'del': 39,
 'santo': 40,
 'domingo': 41,
 'nigeria': 42,
 'midlands': 43,
 'france': 44,
 'georgia': 45,
 'mumbai': 46,
 'pennsylvania': 47,
 'street': 48,
 'road': 49,
 'colorado': 50,
 'jersey': 51,
 'wellington': 52,
 'park': 53,
 'golden': 54,
 'horseshoe': 55,
 'manchester': 56,
 'ohio': 57,
 'tennessee': 58,
 'buenos': 59,
 'aires': 60,
 'cook': 61,
 'oklahoma': 62,
 'massachusetts': 63,
 'chicago': 64,
 'british': 65,
 'kenya': 66,
 'virginia

In [46]:
# integer encode the documents
encoded_docs = t.texts_to_sequences(to_vectorize['address'])

In [68]:
from keras.preprocessing.sequence import pad_sequences

max_length = 5
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)

[[ 1  0  0  0  0]
 [ 1  0  0  0  0]
 [ 1  0  0  0  0]
 ...
 [69 20 65 32 11]
 [ 7 34 35  2  8]
 [29  2  4  3  5]]


In [64]:
embeddings_index = dict()
f = open('glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400001 word vectors.


In [70]:
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector