In [0]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd

In [0]:
url = 'https://raw.githubusercontent.com/Raizel820/NLP_team_project/master/train.csv'
df = pd.read_csv(url)

In [0]:
df.sample(10)

Unnamed: 0,id,keyword,location,text,target
3878,5515,flattened,Chicago,the road to success is paved with pennies that...,0
4571,6499,injuries,"Saskatchewan, Canada",@jamienye u can't blame it all on coaching man...,0
2077,2983,dead,"Sochi, KDA, RU",@hlportal Hello! I'm looking for mod Cold Ice....,0
4475,6365,hostages,cuba,#hot C-130 specially modified to land in a st...,1
6533,9344,survived,,@TheDailyShow Mahalo nui loa for making my 20s...,0
5044,7192,mudslide,,her cake looks like a mudslide hah,0
5358,7647,panic,"Leeds, United Kingdom",Obligatory middle of the night panic attack,0
642,931,blaze,,Do you know anyone looking to move to Hammond ...,0
6370,9102,suicide%20bomb,,...//..// whao.. Pic of 16yr old PKK suicide b...,1
3751,5330,fire,Canada,I'm On Fire. http://t.co/WATsmxYTVa,0


In [0]:
df.shape

(7613, 5)

In [0]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
id          7613 non-null int64
keyword     7552 non-null object
location    5080 non-null object
text        7613 non-null object
target      7613 non-null int64
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [0]:
null_values=df.isna().sum()
null_values=pd.DataFrame(null_values,columns=['null'])
sum_tot=len(df)
null_values['percent']=null_values['null']/sum_tot*100
round(null_values,3).sort_values('percent',ascending=False)

Unnamed: 0,null,percent
location,2533,33.272
keyword,61,0.801
id,0,0.0
text,0,0.0
target,0,0.0


In [0]:
n_location = df.location.unique().size
n_loc_total=df.location.size
print ("Number of unique location:  %i" % n_location)
print ("Number of all location:  %i" % n_loc_total)

Number of unique location:  3342
Number of all location:  7613


As we can see here, almost half of the location values are unique. In addition, 2533 of location are missing values. Therefore, I will drop 'location' column completely.

In [0]:
df= df.drop('location', axis=1)
df.sample(5)

Unnamed: 0,id,keyword,text,target
4722,6714,lava,contemplating going to chilis just to get a mo...,0
5352,7640,pandemonium,@PBohanna Probably a dead boring 1st hour and ...,0
3987,5664,floods,In #India 119000 people have taken shelter in ...,1
3683,5242,fatality,@pxnatosil @RenuncieDilma Fatality!,0
7239,10367,weapons,WWII Memories Plus Lessons of Hiroshima We Sti...,1


In [0]:
df= df.dropna()
df.shape

(7552, 4)

In [0]:
null_values=df.isna().sum()
null_values=pd.DataFrame(null_values,columns=['null'])
sum_tot=len(df)
null_values['percent']=null_values['null']/sum_tot*100
round(null_values,3).sort_values('percent',ascending=False)

Unnamed: 0,null,percent
id,0,0.0
keyword,0,0.0
text,0,0.0
target,0,0.0


# Basic Feature Extraction - 1

Normally, I tried to make data cleaning first. Then, I realized that while making data cleaning, I am losing some of characters that can help data cleaning. Therefore, there will be two part of feature extraction. Here, I will extract features that can't be exracted after data cleaning.

### 1) Number of stopwords

In [0]:

!pip install -q wordcloud
import wordcloud
from nltk.corpus import stopwords
import nltk
import string
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [0]:
df['stopwords'] = df['text'].apply(lambda x: len([x for x in x.split() if x in stop]))
df[['text','stopwords']].head()

Unnamed: 0,text,stopwords
31,@bbcmtd Wholesale Markets ablaze http://t.co/l...,0
32,We always try to bring the heavy. #metal #RT h...,2
33,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
34,Crying out for more! Set me ablaze,3
35,On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0


In [0]:

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return count

df['punctuation'] = df['text'].apply(lambda x: count_punct(x))

### 2) Number of hastag characters

One more interesting feature which we can extract from a tweet is calculating the number of hashtags or mentions present in it. This also helps in extracting extra information from our text data.

In [0]:
df['hastags'] = df['text'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
df[['text','hastags']].head()

Unnamed: 0,text,hastags
31,@bbcmtd Wholesale Markets ablaze http://t.co/l...,0
32,We always try to bring the heavy. #metal #RT h...,2
33,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
34,Crying out for more! Set me ablaze,0
35,On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0


### 3) Number of numerics
Calculate the number of numerics which are present in the tweets can be useful. At least, it doesn't hurt

In [0]:
df['numerics'] = df['text'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
df[['text','numerics']].head()

Unnamed: 0,text,numerics
31,@bbcmtd Wholesale Markets ablaze http://t.co/l...,0
32,We always try to bring the heavy. #metal #RT h...,0
33,#AFRICANBAZE: Breaking news:Nigeria flag set a...,0
34,Crying out for more! Set me ablaze,0
35,On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0


### 4) Number of Uppercase words
Anger or rage is quite often expressed by writing in UPPERCASE words which makes this a necessary operation to identify those words.

In [0]:
df['upper'] = df['text'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
df[['text','upper']].head()

Unnamed: 0,text,upper
31,@bbcmtd Wholesale Markets ablaze http://t.co/l...,0
32,We always try to bring the heavy. #metal #RT h...,1
33,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
34,Crying out for more! Set me ablaze,0
35,On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,9


Please add your further ideas for data extraction.
"Annotated" tweets can be another category that we can think about. For now, I don't have a clear idea how to extract annotated tweets. Or, I don't know yet but sentiment analysis may have such a category (I never did sentimant analysis up to now, and I really don't know). This page may give an idea: https://developer.twitter.com/en/docs/labs/overview/whats-new/annotations



---



# **Text cleaning techniques**

### Lower case

The first pre-processing step which we will do is transform our tweets into lower case. This avoids having multiple copies of the same words. For example, while calculating the word count, ‘Analytics’ and ‘analytics’ will be taken as different words.

In [0]:
df['text'] = df['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df['text'].head()

31    @bbcmtd wholesale markets ablaze http://t.co/l...
32    we always try to bring the heavy. #metal #rt h...
33    #africanbaze: breaking news:nigeria flag set a...
34                   crying out for more! set me ablaze
35    on plus side look at the sky last night it was...
Name: text, dtype: object

Removing Punctuation

In [0]:
df['text'] = df['text'].str.replace('[^\w\s]','')
df['text'].head()

31    bbcmtd wholesale markets ablaze httptcolhyxeohy6c
32    we always try to bring the heavy metal rt http...
33    africanbaze breaking newsnigeria flag set abla...
34                    crying out for more set me ablaze
35    on plus side look at the sky last night it was...
Name: text, dtype: object

Removal of Stop Words

In [0]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
df['text'] = df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df['text'].sample(10)

5175    meek mill begging nicki minaj let obliterate o...
1055    asymbina tithenai im hampered liking crossbody...
3340    worldnews fallen powerlines glink tram update ...
7235    jamesmelville old testimony weapons used promo...
4243    skinny jeans hazardous health socialnews httpt...
7264    eyes smile pretty smile good hair miss luhan e...
3044    nepal earthquake 3 months women fear abuse htt...
1502    catastrophic effects hiroshima nagasaki atomic...
6098    youre lost alone youre sinking like stone carr...
6081           random hole broke street httptcodwu8qqys0v
Name: text, dtype: object

Remove html tags

In [0]:
from bs4 import BeautifulSoup

def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text()
    return stripped_text

strip_html_tags('sdasdasdasd http://t.co/7AzE4IoGMe Risk Assessmen ')

# For now, it didn't work. I will come back to this later

'sdasdasdasd http://t.co/7AzE4IoGMe Risk Assessmen '

Spell Correction

We’ve all seen tweets with a plethora of spelling mistakes. Our timelines are often filled with hastly sent tweets that are barely legible at times.

In that regard, spelling correction is a useful pre-processing step because this also will help us in reducing multiple copies of words. For example, “Analytics” and “analytcs” will be treated as different words even if they are used in the same sense.

To achieve this we will use the textblob library. 

In [0]:
from textblob import TextBlob
df['text'][:5].apply(lambda x: str(TextBlob(x).correct()))

31    bbcmtd wholesale markets ablaze httptcolhyxeohy6c
32    always try bring heavy metal it httptcoyao1e0xngw
33    africanbaze breaking newsnigeria flag set abla...
34                                    crying set ablaze
35    plus side look sky last night ablaze httptcoqq...
Name: text, dtype: object

In [0]:
# Apply a first round of text cleaning techniques
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

round1 = lambda x: clean_text_round1(x)

In [0]:
df['text'] = df.text.apply(round1)
df.text

31                       bbcmtd wholesale markets ablaze 
32                       always try bring heavy metal rt 
33      africanbaze breaking newsnigeria flag set abla...
34                                      crying set ablaze
35                  plus side look sky last night ablaze 
                              ...                        
7578                                cameronhacker wrecked
7579    three days work theyve pretty much wrecked hah...
7580    fx forex trading cramer igers  words wrecked d...
7581    engineshed great atmosphere british lion gig t...
7582      cramer igers  words wrecked disneys stock cnbc 
Name: text, Length: 7552, dtype: object

In [0]:
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    return text

round2 = lambda x: clean_text_round2(x)

In [0]:
df['text'] = df.text.apply(round2)
df.text

31                       bbcmtd wholesale markets ablaze 
32                       always try bring heavy metal rt 
33      africanbaze breaking newsnigeria flag set abla...
34                                      crying set ablaze
35                  plus side look sky last night ablaze 
                              ...                        
7578                                cameronhacker wrecked
7579    three days work theyve pretty much wrecked hah...
7580    fx forex trading cramer igers  words wrecked d...
7581    engineshed great atmosphere british lion gig t...
7582      cramer igers  words wrecked disneys stock cnbc 
Name: text, Length: 7552, dtype: object

In [0]:
freq = pd.Series(' '.join(df['text']).split()).value_counts()[:20]
freq

like         344
amp          298
im           296
fire         246
get          229
new          224
via          219
people       193
one          193
dont         191
news         191
video        165
us           163
emergency    156
disaster     151
police       138
would        131
still        129
body         124
burning      120
dtype: int64

# Basic Feature Extraction - 2

###  Number of Words

In [0]:
df['word_count'] = df['text'].apply(lambda x: len(str(x).split(" ")))
df[['text','word_count']].head()

Unnamed: 0,text,word_count
31,bbcmtd wholesale markets ablaze,5
32,always try bring heavy metal rt,7
33,africanbaze breaking newsnigeria flag set abla...,8
34,crying set ablaze,3
35,plus side look sky last night ablaze,8


In [0]:
null_values=df.isna().sum()
null_values=pd.DataFrame(null_values,columns=['null'])
sum_tot=len(df)
null_values['percent']=null_values['null']/sum_tot*100
round(null_values,3).sort_values('percent',ascending=False)

Unnamed: 0,null,percent
id,0,0.0
keyword,0,0.0
text,0,0.0
target,0,0.0
stopwords,0,0.0
punctuation,0,0.0
hastags,0,0.0
numerics,0,0.0
upper,0,0.0
word_count,0,0.0


### Number of characters

In [0]:
df['char_count'] = df['text'].str.len() ## this also includes spaces
df[['text','char_count']].head()

Unnamed: 0,text,char_count
31,bbcmtd wholesale markets ablaze,32
32,always try bring heavy metal rt,32
33,africanbaze breaking newsnigeria flag set abla...,53
34,crying set ablaze,17
35,plus side look sky last night ablaze,37


### 3) Average Word Length

In [0]:
def avg_word(sentence):
  words = sentence.split()
  return (sum(len(word) for word in words)/(len(words)+0.000001))


In [0]:
df['avg_word'] = df['text'].apply(lambda x: avg_word(x))
df[['text','avg_word']].head()

Unnamed: 0,text,avg_word
31,bbcmtd wholesale markets ablaze,6.999998
32,always try bring heavy metal rt,4.333333
33,africanbaze breaking newsnigeria flag set abla...,6.571428
34,crying set ablaze,4.999998
35,plus side look sky last night ablaze,4.285714


In [0]:
df.sample(10)

Unnamed: 0,id,keyword,text,target,stopwords,punctuation,hastags,numerics,upper,word_count,char_count,avg_word
7379,10562,windstorm,texas seeks comment rules changes windstorm in...,0,3,5,0,0,0,8,69,7.749999
5978,8538,screaming,screaming different languages httptcordfaakkbnj,0,0,5,0,1,4,5,48,10.999997
3567,5098,famine,savagenation reminds peasants destroyed food s...,1,7,6,0,0,0,12,101,7.499999
135,195,aftershock,anyone need pu tonight play hybrid slayer eu ...,0,1,11,0,0,6,16,92,6.416666
807,1173,blight,anellatulip theory makes way much sense says d...,0,12,1,0,0,0,12,77,5.5
7215,10334,weapon,iranian warship points weapon american helicop...,1,1,8,0,0,0,7,50,7.333332
4780,6801,loud%20bang,actionmoviestaughtus things actually explode l...,0,3,5,1,0,0,7,62,7.999999
3262,4687,engulfed,suelinflower words describe physical painthey ...,0,10,1,0,0,0,12,94,6.916666
7282,10422,whirlwind,set new record states days dont even know wa...,0,8,9,0,2,3,17,93,5.133333
6261,8946,storm,storm came fuck cool,1,7,2,0,0,0,4,20,4.249999
