# Disaster Tweet Classification

***Console***

In [10]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.2.0
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


***Importing Libraries***

In [4]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import spacy
import sklearn
from sklearn.feature_extraction.text import CountVectorizer

***Importing Files***

In [5]:
testCSV = pd.read_csv("./data/test.csv")
trainCSV = pd.read_csv("./data/train.csv")


***Viewing data***

In [6]:
trainCSV.isnull().sum()
trainCSV.shape

(7613, 5)

***Cleaning Text and Removing stopwords***

In [7]:
#Loops through all the text

def cleanText(df):
    
    stopWords = set(stopwords.words("english"))
    
    for index, row in df.iterrows():
        curText = row.text
        
        #Makes lowercase
        curText = curText.lower()
        #removing stop words
        curText = " ".join(filter(lambda x:x not in stopWords, curText.split()))
        #removing all words starting with @
        curText = " ".join(filter(lambda x:x[0] != "@", curText.split()))
        # removes all links
        webStart = "http"
        curText = " ".join(filter(lambda x:x[0:len(webStart)] != webStart, curText.split()))
        # removing all non alpha numeric char exlcuding period
        curText = re.sub(r'[^a-z0-9. ]+', '',curText)
        # removing "..." (multiple periods in a row)
        curText = re.sub(r'([.])\1+', '',curText)
        # removing multiple spaces in a row
        curText = re.sub(r'([ ])\1+', '',curText)
        
            # removing multiple of the same character in a row
            # curText = re.sub(r'([a-z])\3+', r'\2',curText)
            
        df.at[index,'text'] = curText
        
        
cleanText(testCSV)
cleanText(trainCSV)


***Lemmitization***

In [11]:
def lemm_tweets(df):
    nlp = spacy.load('en_core_web_sm')
    #convert the tweet column in to a list of strings
    tweets = df['text'].tolist()
    i = 0;
    for tweet in tweets:
        doc = nlp(tweet) #get current tweet
        lemmatized_tweet = ''
        for token in doc:
            lemmatized_tweet += token.lemma_ # each token will now be lemmatized 
        tweets[i] = token.lemma_
        i+=1
    print(tweets)

lemm_tweets(testCSV)
lemm_tweets(trainCSV)    

['crash', '.', 'save', 'wildfire', 'taiwan', 'earthquake', 'eh', 'you', 'hat', 'off', 'cold', 'that', 'that', 'if', 'awesome', 'market', 'ablaze', 'ablaze', 'nsfw', 'follow', '.', 'via', 'ablaze', 'ablaze', 'ablaze', 'wonder', 'pulsradio', '.', '.', 'traffic', '.', 'accident', 're', 'accident', 'a283', 'anime', 'dogbite', 'happen', 'out', 'c', 'p', 'pdx911', 'like', 'paraplegic', 'fall', 'tem', 'new', 'ice', 'collin', 'ice', 'will.unknown', 'gradschoolapp', 'bond', 'ice', 'aftershock', 'eisenhower', 'invahnwetrust', 'aftershock', '.', 'aftershock', '.', 'cali', 'ago', 'week', '.', 'airplane29072015', 'makeup', '29072015', '.', '.', 'accident', 'yankee', 'wedn', 'e', 'wedn', 'crash', 'ambulance', 'compliantebay', 'laugh', '.', 'ambulancerainbowpower', 'science', 'ambulance', 'ambulance', 'crash', '29800', 'whatsyouremergency', 'day', 'via', 'via', 'annihilate', '.', '.', 'mosquito', '.', 'beloved', 'annihilate', 'nothing', '.', 'annihilate', 'annihilate', 'dd', 'hour', 'saltriverwildhor



***Vectorization***

In [18]:
def vectorize_tweets(df): 
    cv = CountVectorizer() #instance of countvectorizer
    tweets = df['text'].tolist() #covert tweet col to a list
    

ValueError: Iterable over raw text documents expected, string object received.