# Data Cleaning

### Goal of this notebook:  
Clean and organize the text data that we scrapped and make 2 files format:
- Corpus: a collection of text
- Document-Term Matrix: word counts in matrix format 

In [1]:
# dependencies
import re
import json
import string
import nltk
import numpy as np
import pickle
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# nltk.download()    uncomment this line when running the cell for the first time 
pd.set_option('max_colwidth',200)

from sklearn.feature_extraction.text import CountVectorizer

## 1. Load texts

In [2]:
# Put here the path of the json file saved from src/data/data_processing.py
data_folder_path = '/Users/mouhamethtakhafaye/Desktop/behavox_assignment/data/raw.json'

# Path for outfiles
outfile_path = '/Users/mouhamethtakhafaye/Desktop/behavox_assignment/data/'

In [3]:
load_ = []
with open(data_folder_path) as f:
    for line in f:
        load_.append(json.loads(line))

df = pd.DataFrame.from_dict(load_, orient='columns').transpose()
df.columns = ['Messages']

In [4]:
# See the collected text from sms and chats emails folders
df

Unnamed: 0,Messages
SMS,\n Sms #2\n \n Hi Ina! How are you?\n \n thx i getting better\n \n What ...
CHATS,"\n Hello?\n \n Morning\n \n Yeah, how are you?\n \n I'm pretty good sir. Very Good. We're just not, ..."
EMAILS,"Please let me know if you still need Curve Shift. Thanks, Heather -----Original Message----- From: Allen, Phillip K. Sent: Friday, December 07, 2001 5:14 AM To: Dunton, Heather Subject: RE: W..."


## 2. Quick overview of each channel of communications

Remove comments to see raw texts

In [5]:
#df.Messages.loc['CHATS']

In [6]:
#df.Messages.loc['SMS']

In [7]:
#df.Messages.loc['EMAILS']

## 3. Apply a first round of text cleaning techniques

In [8]:
# Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers
def clean_text_round1(text):
    
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

# Remove square brackets
round1 = lambda x: clean_text_round1(x)

In [9]:
# Let's take a look at the updated text
data_clean = pd.DataFrame(df.Messages.apply(round1))
#data_clean.Messages.loc['EMAILS']  # to see text for chats or sms, replace EMAILS by CHATS or SMS

## 4. Apply a second round of cleaning

In [10]:
# Get rid of some additional quotation marks and newline text that was missed the first time around.
def clean_text_round2(text):
    
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    return text

round2 = lambda x: clean_text_round2(x)

In [11]:
# Let's take a look at the updated text
data_clean = pd.DataFrame(data_clean.Messages.apply(round2))

## 5. Stop words removal

In [12]:
# Downloading the stop words list
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mouhamethtakhafaye/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
# langages supported by nltk
stopwords.fileids()

['arabic',
 'azerbaijani',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'greek',
 'hungarian',
 'indonesian',
 'italian',
 'kazakh',
 'nepali',
 'norwegian',
 'portuguese',
 'romanian',
 'russian',
 'slovene',
 'spanish',
 'swedish',
 'tajik',
 'turkish']

We see that the chinese and hindu langage are not supported. We will try to remove them from our dataset. 

In [14]:
# Loading the stop words
stop_words = list(stopwords.words())

In [15]:
for stop_word in stop_words:

    regex_stopword = r"\b" + stop_word + r"\b"
    data_clean['Messages'] = data_clean['Messages'].str.replace(regex_stopword, '')

### Remove chinese and hindu characters since they are not supported by ntlk 

In [16]:
def clean_text_round3(text):
    # Manually copied as much as possible the chinese and hindu characters
    
    #(I think it is hindu characters, Im not sure as I dont know much about the hindu language).
    
    text = re.sub('是东亚地区统一的一党主权国家门特别行政区	照总面积计算現在裤脚許多人需要同意還沒有重庆鞋子全部打湿完了\
                  我會等待確認裤脚鞋子全部打湿完了こんにちはこれは日本語の例です', '', text)
    return text

round3 = lambda x: clean_text_round3(x)

In [17]:
data_clean = pd.DataFrame(data_clean.Messages.apply(round3))
data_clean

Unnamed: 0,Messages
SMS,sms hi ina thx getting better hell happened damn call asap ...
CHATS,hello morning yeah pretty good sir good allowed conversations chat ...
EMAILS,please let know still need curve shift thanks heather original message phillip sent friday december dunton heather subject west position heather attach file email origina...


 ### NOTE:
This data cleaning aka text pre-processing step could go on for a while, but we are going to stop for now. After going through some analysis if we see that the results don't make sense or could be improved, we will come back and make more edits such as stemming and lemmatization.

In [18]:
# Let see the result that we have so far...
data_clean.Messages.loc['EMAILS'] # to see text for chats or sms, replace EMAILS by CHATS or SMS

'please let  know   still need curve shift  thanks heather  original message    phillip    sent friday december      dunton heather subject  west position  heather    attach  file   email   original message   dunton heather   sent wednesday december    pm   phillip  belden tim subject fw west position  attached   delta position          original message    phillip    sent wednesday december      dunton heather subject  west position  heather    exactly   need  would  possible  add  prior day     dates    pivot table   order  validate  curve shift   dates    need  prior days ending positions  thank   phillip    original message   dunton heather   sent tuesday december    pm  belden tim  phillip  cc driscoll michael  subject west position   attached   delta position            file westdeltaposxls    let  know     questions   heather david coursey  hope ahead   learned   tragedies       years go  sucked  adversity teaches          us  important lessons  prosperity            bet      upc

### 5. Save Corpus 

In [19]:
data_clean.shape

(3, 1)

In [20]:
data_clean_ = data_clean.to_json()

In [21]:
df_ = df.to_json()

In [22]:
with open(outfile_path + 'clean_corpus' + ".json", 'w') as corpus:
    json.dump(data_clean_, corpus)
    
# lets also save our raw corpus it might be usefull later    
with open(outfile_path + 'raw_corpus' + ".json", 'w') as corpus:
    json.dump(df_, corpus)
    
    
# Save pickles files 
with open('Pickles/clean_corpus.pickle', 'wb') as output:
    pickle.dump(data_clean, output)

    
with open('Pickles/raw_corpus.pickle', 'wb') as output:
    pickle.dump(df, output)

### 6. Save Document-Term Matrix

In [23]:
cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.Messages) # fit count vectorizor to our CLEAN messages data

# Convert it to an array and label all the columns
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = data_clean.index

# Document-Term matrix
data_dtm

Unnamed: 0,able,abn,accelerated,accelerating,accenture,acceptance,accepted,access,accessing,accessories,...,我會等待確認,是东亚地区统一的一党主权国家,澳门特别行政区,照总面积计算,現在,裤脚,許多人需要同意,還沒有,重庆,鞋子全部打湿完了
SMS,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
CHATS,1,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,0,1,1,1,0
EMAILS,3,2,1,1,5,9,1,11,1,2,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Save pickles files 
with open('Pickles/term_matrix.pickle', 'wb') as output:
    pickle.dump(data_dtm, output)