# Data Preprocessing for Twitter

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import csv
import sys
# from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
import time
import os
from IPython.display import Image
from IPython.display import display
# os.remove('final_twitter_preprocessing.csv')
# os.remove('twitter_preprocessing.csv')

### Load the Twitter Dataset
# Pipeline: select the columns of interest, drop repeated (key, message) pairs,
# drop rows with any missing value, order by creation time, keep English tweets.
disease = pd.read_csv('TW_Tweet.csv', low_memory = False, encoding = 'UTF-8')

df = pd.DataFrame(disease, columns = ['id', 'keyword', 'created', 'language', 'message'])
df.columns = ['id', 'key', 'created_time', 'language', 'message']

rm_duplicates = df.drop_duplicates(subset = ['key', 'message'])
rm_na = rm_duplicates.dropna()
dtime = rm_na.sort_values(['created_time']).reset_index(drop = True)

# English tweets only, without the two "johnson & johnson" keyword spellings
keep_mask = (
    (dtime['language'] == 'en')
    & (dtime['key'] != 'johnson & johnson')
    & (dtime['key'] != 'johnson&johnson')
)
dlang = dtime[keep_mask].reset_index(drop = True)

# Preview the first rows and report how many tweets remain
display(dlang.head(3))
print(len(dlang))

Unnamed: 0,id,key,created_time,language,message
0,836000000000000000,ibrutinib,2017-02-26T22:42:00.000Z,en,RT @szusmani: Acquired mutations associated wi...
1,837000000000000000,psoriasis,2017-03-01T03:16:00.000Z,en,RT @PuckMuckDuck: NJ Muckers - Stop & Thank ou...
2,838000000000000000,psoriasis,2017-03-03T14:00:00.000Z,en,RT @NewingtonComms: thanks to @NParveenG @sean...


153897


# Language Detection & Translation

Language detection and translation are implemented with the [Yandex API](https://tech.yandex.com/translate/). <br>
Two functions are defined here: **get_translation_direction** and **translation**.<br>
Together they detect the language of a text and translate it into the language we specify. 

In [2]:
### First, log in and obtain a Yandex API key from https://tech.yandex.com/translate/

import json
import requests
from urllib.request import urlopen

# Add your own key here.
# Preferred: export it as the YANDEX_TRANSLATE_API_KEY environment variable so the
# credential does not have to be stored in the notebook. The literal below is kept
# only as a backward-compatible fallback when the variable is unset or empty.
# NOTE(review): a key committed to a notebook should be treated as exposed —
# consider revoking it and removing the fallback.
api_key = (
    os.environ.get("YANDEX_TRANSLATE_API_KEY")
    or "trnsl.1.1.20170328T192339Z.3d51057a7bbbe500.4ae809a6429ea2565ff027216b5233208c1c4f90"
)

# Detect the language of text
def get_translation_direction(api_key,text):
    """Ask the Yandex Translate ``detect`` endpoint which language ``text`` is in.

    Parameters
    ----------
    api_key : str
        Yandex Translate API key, sent as the ``key`` query parameter.
    text : str
        Text to analyse. When it is the empty string no ``text`` parameter is sent.

    Returns
    -------
    The ``lang`` field of the JSON response (``'en'`` for the English example
    used in this notebook).

    Raises
    ------
    KeyError
        If the JSON response has no ``lang`` field (presumably when the API
        reports an error — confirm against the API documentation).
    """
    url = "https://translate.yandex.net/api/v1.5/tr.json/detect"
    params = {"key": api_key}
    if text != "":
        params["text"] = text
    # Hand the parameters to requests instead of concatenating them into the URL:
    # characters such as '&', '#' or '+' inside the text are then percent-encoded
    # rather than being interpreted as query-string syntax.
    r = requests.get(url, params=params)
    return r.json()['lang']
    
# Translate the text into the requested language
def translation(api_key,text,lang):
    """Translate ``text`` with the Yandex Translate ``translate`` endpoint.

    Parameters
    ----------
    api_key : str
        Yandex Translate API key, sent as the ``key`` query parameter.
    text : str
        Text to translate. Omitted from the request when empty.
    lang : str
        Language code sent as the ``lang`` query parameter (``"de"`` in the
        example of this notebook). Omitted from the request when empty.

    Returns
    -------
    str
        The pieces of the ``text`` field of the JSON response joined into one
        string. The same string is also printed before it is returned.

    Raises
    ------
    KeyError
        If the JSON response has no ``text`` field.
    """
    url = "https://translate.yandex.net/api/v1.5/tr.json/translate"
    params = {"key": api_key}
    if text != "":
        params["text"] = text
    if lang != "":
        params["lang"] = lang
    # requests percent-encodes the values, so '&', '#', '+' etc. in the text
    # no longer break or truncate the query string.
    r = requests.get(url, params=params)
    # Decode the response body once and reuse it for both the print and the return.
    translated = ''.join(r.json()['text'])
    print(translated)
    return translated
    
# Example of the detection / translation workflow.
# `text` is the sentence to analyse and `lang` is the language to translate into;
# the available language codes are listed at
# https://tech.yandex.com/translate/doc/dg/concepts/api-overview-docpage/
text = "Do you know that she is coming?"
lang = "de"

print("Language Detection:")
detected = get_translation_direction(api_key, text)
print("{} , {}".format(detected, text))

print("Translation:")
# translation() prints the translated sentence itself before returning it
translated = translation(api_key, text, lang)
print("{} , {}".format(lang, translated))

Language Detection:
en , Do you know that she is coming?
Translation:
Sie wissen, dass Sie kommen wird?
de , Sie wissen, dass Sie kommen wird?


# Final Preprocessing

In this section, preprocessing is carried out in the following steps.<br>

| Preprocessing Steps| Packages           | Notes                             |
|------------------- |--------------------|-----------------------------------|
| Remove Url         | Regular expression |                                   |
| Remove Stopwords   | nltk.corpus        |                                   |
| Remove Punctuation | string.punctuation |                                   |
| Lemmatizing        | nltk.stem          |Lemmatize words in Noun and Verb   |
| Part of Speech(POS)| nltk.pos_tag       |Preserve Noun, Adverb and Adjective|
| Tokenize           | split              |Unigram                            |
| Remove NA          | pandas             |                                   |
| Drop Duplicates    | pandas             |                                   |

In [3]:
import gensim
from gensim import corpora, models, similarities
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.stem import WordNetLemmatizer
import string
import time

# Shared text-processing resources: English stop words, punctuation characters
# and the WordNet lemmatizer
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

# Start a fresh intermediate csv file that contains only the header row;
# the preprocessed tweets are appended to it afterwards
header = ['id', 'key', 'created_time', 'language', 'message', 're_message']
with open('twitter_preprocessing.csv', mode = 'w', newline = '', encoding = 'UTF-8') as csvfile:
    csv.writer(csvfile).writerow(header)
    
# Data preprocessing steps

# Part-of-speech tags that are kept: nouns, adverbs and adjectives
KEEP_POS_TAGS = frozenset(['NN','NNP','NNS','RB','RBR','RBS','JJ','JJR','JJS'])
# Tokens that are always removed from the cleaned text
DROP_WORDS = frozenset(['co', 'https', 'http','rt','com','amp','fe0f','www','ve','dont',"i'm","it's",'isnt','âźă','âąă','âł_','kf4pdwe64k'])


def preprocess_message(message):
    """Return the cleaned text of one tweet as space-separated unigrams.

    Steps, in order: remove ``http...`` urls, extract lower-cased word tokens,
    strip punctuation characters inside tokens, remove English stop words,
    lemmatize each word as a noun and then as a verb, drop words of three
    characters or fewer, keep only words tagged as noun/adverb/adjective that
    are not in ``DROP_WORDS``, and finally remove stop words again (the
    lemmatizer can turn a word into a stop word).

    Relies on the notebook-level ``stop``, ``exclude`` and ``lemma`` objects.

    Parameters
    ----------
    message : object
        Raw tweet text; it is converted with ``str()`` before processing.

    Returns
    -------
    str
        The remaining words joined by single spaces ('' when nothing is left).
    """
    no_url = re.sub(r"http\S+", "", str(message))
    tokens = ' '.join(re.findall(r"[\w']+", no_url)).lower().split()
    # Removing punctuation can empty a token (e.g. "'"); the split() below drops those
    stripped = ' '.join(''.join(c for c in tok if c not in exclude) for tok in tokens)
    words = [w for w in stripped.split() if w not in stop]
    words = [lemma.lemmatize(w, pos = 'n') for w in words]
    words = [lemma.lemmatize(w, pos = 'v') for w in words]
    words = [w for w in words if len(w) > 3]
    tagged = nltk.pos_tag(words)
    kept = [w for w, tag in tagged if tag in KEEP_POS_TAGS and w not in DROP_WORDS]
    return ' '.join(w for w in kept if w not in stop)


# Open the intermediate file once and append one row per tweet, instead of
# reopening it for every single row
with open('twitter_preprocessing.csv', 'a', encoding = 'UTF-8', newline = '') as csvfile:
    writer = csv.writer(csvfile)
    tweet_columns = zip(
        dlang['id'], dlang['key'], dlang['created_time'], dlang['language'], dlang['message'])
    for tweet_id, key, created_time, language, message in tweet_columns:
        writer.writerow([
            str(int(tweet_id)), key, created_time, language, message,
            preprocess_message(message)])
# Reload the intermediate file, remove repeated (id, re_message) pairs and rows
# with missing values, then export the final dataset
OUTPUT_COLUMNS = ['id', 'key', 'created_time', 'language', 'message', 're_message']

df_postncomment = pd.read_csv('twitter_preprocessing.csv', sep = ',', encoding = 'UTF-8')
df_rm = df_postncomment.drop_duplicates(subset = ['id', 're_message'])
rm_english_na = df_rm.dropna().reset_index(drop = True)

dfinal_tw = pd.DataFrame(rm_english_na, columns = OUTPUT_COLUMNS)
dfinal_tw.to_csv('final_twitter_preprocessing.csv', columns = OUTPUT_COLUMNS, encoding = 'UTF-8')

# The intermediate file is no longer needed once the final csv is written
os.remove('twitter_preprocessing.csv')

In [5]:
# Sanity check: reload the exported file (first csv column is the index),
# preview it and report the number of rows
test = pd.read_csv(
    'final_twitter_preprocessing.csv',
    index_col = 0,
    sep = ',',
    encoding = 'UTF-8')
display(test.head(3))
row_count = len(test)
print(row_count)

Unnamed: 0,id,key,created_time,language,message,re_message
0,836000000000000000,ibrutinib,2017-02-26T22:42:00.000Z,en,RT @szusmani: Acquired mutations associated wi...,szusmani mutation associate ibrutinib resistan...
1,837000000000000000,psoriasis,2017-03-01T03:16:00.000Z,en,RT @PuckMuckDuck: NJ Muckers - Stop & Thank ou...,puckmuckduck muckers thank national sponsor ab...
2,838000000000000000,psoriasis,2017-03-03T14:00:00.000Z,en,RT @NewingtonComms: thanks to @NParveenG @sean...,newingtoncomms thank nparveeng seananstee burn...


153861
