In [3]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import string

In [5]:
tweets = pd.read_csv("train.csv", usecols=['id','text', 'target'])
test = pd.read_csv("test.csv")

In [6]:
tweets.drop_duplicates(subset = 'text', keep = False, inplace = True)
tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7434 entries, 0 to 7612
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      7434 non-null   int64 
 1   text    7434 non-null   object
 2   target  7434 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 232.3+ KB


In [7]:
import io
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
stemmer = SnowballStemmer('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mausa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
def remove_stopword(text):
    new_text = []
    for e in text:
        if e not in stopwords and e.isalpha():
            new_text.append(e)
    text = new_text
    return " ".join(new_text)

def stemm(text):
    text = [stemmer.stem(word) for word in text.split()]
    return " ".join(text)

def contains_punctuation(text):
    punctuation = set(string.punctuation)
    for character in text:
        if character in punctuation:
            return True
    return False

def amount_of_punctuation(text):
    punctuation = set(string.punctuation)
    amount = 0
    for character in text:
        if character in punctuation: amount += 1
    return amount

In [None]:
x_train, x_test, y_train, y_test = \
train_test_split(tweets['text'], tweets['target'], test_size = 0.25, random_state = 123)

### Fichur Inginierin


In [11]:
tweets_metrics = tweets[['id','text','target']]
tweets_metrics.head()

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1


In [13]:
#Agrego longitud de tweet en caracteres
tweets_metrics['length'] = tweets_metrics['text'].apply(lambda x: len(x))
tweets_metrics.head()

Unnamed: 0,id,text,target,length
0,1,Our Deeds are the Reason of this #earthquake M...,1,69
1,4,Forest fire near La Ronge Sask. Canada,1,38
2,5,All residents asked to 'shelter in place' are ...,1,133
3,6,"13,000 people receive #wildfires evacuation or...",1,65
4,7,Just got sent this photo from Ruby #Alaska as ...,1,88


In [14]:
tweets_metrics['avg_word_length'] = tweets_metrics['text'].str.split().apply(lambda x: [len(y) for y in x]).transform(lambda x: np.mean(x))
tweets_metrics.head()

Unnamed: 0,id,text,target,length,avg_word_length
0,1,Our Deeds are the Reason of this #earthquake M...,1,69,4.384615
1,4,Forest fire near La Ronge Sask. Canada,1,38,4.571429
2,5,All residents asked to 'shelter in place' are ...,1,133,5.090909
3,6,"13,000 people receive #wildfires evacuation or...",1,65,7.125
4,7,Just got sent this photo from Ruby #Alaska as ...,1,88,4.5


In [15]:
tweets_metrics['amount_of_words'] = tweets_metrics['text'].str.split().transform(lambda x: len(x))
tweets_metrics.head()

Unnamed: 0,id,text,target,length,avg_word_length,amount_of_words
0,1,Our Deeds are the Reason of this #earthquake M...,1,69,4.384615,13
1,4,Forest fire near La Ronge Sask. Canada,1,38,4.571429,7
2,5,All residents asked to 'shelter in place' are ...,1,133,5.090909,22
3,6,"13,000 people receive #wildfires evacuation or...",1,65,7.125,8
4,7,Just got sent this photo from Ruby #Alaska as ...,1,88,4.5,16


In [16]:
unique_words_by_tweet = tweets_metrics['text'].transform(lambda x: x.split()).transform(lambda x: pd.Series(x).unique()).transform(lambda x: len(x))
tweets_metrics['amount_of_unique_words'] = unique_words_by_tweet
tweets_metrics.head()

Unnamed: 0,id,text,target,length,avg_word_length,amount_of_words,amount_of_unique_words
0,1,Our Deeds are the Reason of this #earthquake M...,1,69,4.384615,13,13
1,4,Forest fire near La Ronge Sask. Canada,1,38,4.571429,7,7
2,5,All residents asked to 'shelter in place' are ...,1,133,5.090909,22,20
3,6,"13,000 people receive #wildfires evacuation or...",1,65,7.125,8,8
4,7,Just got sent this photo from Ruby #Alaska as ...,1,88,4.5,16,15


In [17]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()
def return_sia_compound_values(text):
    return sia.polarity_scores(text)['compound']

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\mausa\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [18]:
tweets_metrics['sentiment'] = tweets_metrics['text'].apply(lambda x: return_sia_compound_values(x))
tweets_metrics.head()

Unnamed: 0,id,text,target,length,avg_word_length,amount_of_words,amount_of_unique_words,sentiment
0,1,Our Deeds are the Reason of this #earthquake M...,1,69,4.384615,13,13,0.2732
1,4,Forest fire near La Ronge Sask. Canada,1,38,4.571429,7,7,-0.34
2,5,All residents asked to 'shelter in place' are ...,1,133,5.090909,22,20,-0.296
3,6,"13,000 people receive #wildfires evacuation or...",1,65,7.125,8,8,0.0
4,7,Just got sent this photo from Ruby #Alaska as ...,1,88,4.5,16,15,0.0


In [19]:
tweets_metrics['stopwords_count'] = tweets_metrics['text'].apply(lambda x: len([word for word in str(x).lower().split() if word in stopwords]))
tweets_metrics.head()

Unnamed: 0,id,text,target,length,avg_word_length,amount_of_words,amount_of_unique_words,sentiment,stopwords_count
0,1,Our Deeds are the Reason of this #earthquake M...,1,69,4.384615,13,13,0.2732,6
1,4,Forest fire near La Ronge Sask. Canada,1,38,4.571429,7,7,-0.34,0
2,5,All residents asked to 'shelter in place' are ...,1,133,5.090909,22,20,-0.296,11
3,6,"13,000 people receive #wildfires evacuation or...",1,65,7.125,8,8,0.0,1
4,7,Just got sent this photo from Ruby #Alaska as ...,1,88,4.5,16,15,0.0,7


In [26]:
tweets_metrics['punctuation_count'] = tweets_metrics['text'].apply(lambda x: amount_of_punctuation(x))
tweets_metrics.head()

Unnamed: 0,id,text,target,length,avg_word_length,amount_of_words,amount_of_unique_words,sentiment,stopwords_count,punctuation_count
0,1,Our Deeds are the Reason of this #earthquake M...,1,69,4.384615,13,13,0.2732,6,1
1,4,Forest fire near La Ronge Sask. Canada,1,38,4.571429,7,7,-0.34,0,1
2,5,All residents asked to 'shelter in place' are ...,1,133,5.090909,22,20,-0.296,11,3
3,6,"13,000 people receive #wildfires evacuation or...",1,65,7.125,8,8,0.0,1,2
4,7,Just got sent this photo from Ruby #Alaska as ...,1,88,4.5,16,15,0.0,7,2


In [27]:
#tweets_with_mentions = tweets.loc[tweets['text'].str.contains('@'), ['text', 'target']]
mentions = tweets_metrics['text'].str.findall(r'@.\S*?(?=\s|[:]|$)').to_frame()

In [33]:
tweets_metrics['mentions_count'] = mentions['text'].apply(lambda x: len(x))
tweets_metrics.head()

Unnamed: 0,id,text,target,length,avg_word_length,amount_of_words,amount_of_unique_words,sentiment,stopwords_count,punctuation_count,mentions_count
0,1,Our Deeds are the Reason of this #earthquake M...,1,69,4.384615,13,13,0.2732,6,1,0
1,4,Forest fire near La Ronge Sask. Canada,1,38,4.571429,7,7,-0.34,0,1,0
2,5,All residents asked to 'shelter in place' are ...,1,133,5.090909,22,20,-0.296,11,3,0
3,6,"13,000 people receive #wildfires evacuation or...",1,65,7.125,8,8,0.0,1,2,0
4,7,Just got sent this photo from Ruby #Alaska as ...,1,88,4.5,16,15,0.0,7,2,0


In [36]:
hashtags = tweets_metrics['text'].str.findall(r'#[^?\s].*?(?=\s|$)')
tweets_metrics['hashtags_count'] = hashtags.apply(lambda x: len(x))
tweets_metrics.head()

Unnamed: 0,id,text,target,length,avg_word_length,amount_of_words,amount_of_unique_words,sentiment,stopwords_count,punctuation_count,mentions_count,hashtags_count
0,1,Our Deeds are the Reason of this #earthquake M...,1,69,4.384615,13,13,0.2732,6,1,0,1
1,4,Forest fire near La Ronge Sask. Canada,1,38,4.571429,7,7,-0.34,0,1,0,0
2,5,All residents asked to 'shelter in place' are ...,1,133,5.090909,22,20,-0.296,11,3,0,0
3,6,"13,000 people receive #wildfires evacuation or...",1,65,7.125,8,8,0.0,1,2,0,1
4,7,Just got sent this photo from Ruby #Alaska as ...,1,88,4.5,16,15,0.0,7,2,0,2


In [38]:
tweets_metrics['text_without_stopwords'] = tweets_metrics['text'].str.split()
tweets_metrics['text_without_stopwords'] = tweets_metrics['text_without_stopwords'].apply(remove_stopword)
tweets_metrics.head()

Unnamed: 0,id,text,target,length,avg_word_length,amount_of_words,amount_of_unique_words,sentiment,stopwords_count,punctuation_count,mentions_count,hashtags_count,text_without_stopwords
0,1,Our Deeds are the Reason of this #earthquake M...,1,69,4.384615,13,13,0.2732,6,1,0,1,Our Deeds Reason May ALLAH Forgive us
1,4,Forest fire near La Ronge Sask. Canada,1,38,4.571429,7,7,-0.34,0,1,0,0,Forest fire near La Ronge Canada
2,5,All residents asked to 'shelter in place' are ...,1,133,5.090909,22,20,-0.296,11,3,0,0,All residents asked notified No evacuation she...
3,6,"13,000 people receive #wildfires evacuation or...",1,65,7.125,8,8,0.0,1,2,0,1,people receive evacuation orders California
4,7,Just got sent this photo from Ruby #Alaska as ...,1,88,4.5,16,15,0.0,7,2,0,2,Just got sent photo Ruby smoke pours school


In [76]:
tweets_metrics['longest_word_length_without_stopwords'] = tweets_metrics['text_without_stopwords'].apply(lambda x: ([len(word) for word in str(x).lower().split() if not word.startswith('http')])).apply(lambda x: max(x) if len(x) > 0 else 0)
tweets_metrics.head()

Unnamed: 0,id,text,target,length,avg_word_length,amount_of_words,amount_of_unique_words,sentiment,stopwords_count,punctuation_count,mentions_count,hashtags_count,text_without_stopwords,longest_word_length_without_stopwords
0,1,Our Deeds are the Reason of this #earthquake M...,1,69,4.384615,13,13,0.2732,6,1,0,1,Our Deeds Reason May ALLAH Forgive us,7
1,4,Forest fire near La Ronge Sask. Canada,1,38,4.571429,7,7,-0.34,0,1,0,0,Forest fire near La Ronge Canada,6
2,5,All residents asked to 'shelter in place' are ...,1,133,5.090909,22,20,-0.296,11,3,0,0,All residents asked notified No evacuation she...,10
3,6,"13,000 people receive #wildfires evacuation or...",1,65,7.125,8,8,0.0,1,2,0,1,people receive evacuation orders California,10
4,7,Just got sent this photo from Ruby #Alaska as ...,1,88,4.5,16,15,0.0,7,2,0,2,Just got sent photo Ruby smoke pours school,6
