# Loading and Cleaning Data

In [3]:
import pandas as pd
import os
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import scipy
from scipy import spatial
from textblob import TextBlob
sw = nltk.corpus.stopwords.words('english')


## Loading Data

In [4]:
# Reading txt files and creating the complete dataset

path="./data/conferences"

# list all sub-directories
persons = os.listdir(path)
dict_temp = {}

col_1=[]
col_2=[]
col_3=[]

for person in persons:
    
    files = os.listdir(path + "/" + person)
    
    for file in files:
        with open(path + "/" + person + "/" + file, 'r') as f:
            content = f.read()
            
        col_1.append(person)
        col_2.append(file)
        col_3.append(content)
        

content= {'speaker': col_1, 'conf_number': col_2, 'transcript':col_3}
data = pd.DataFrame(content)

data.head()

Unnamed: 0,speaker,conf_number,transcript
0,boris_johnson,5.txt,I wish I could tell you that this pandemic tha...
1,boris_johnson,4.txt,"Thank you, Mr. President, Your Excellencies, l..."
2,boris_johnson,3.txt,"Mr. Speaker, thank you very much. And with you..."
3,boris_johnson,2.txt,"And there are many, many reasons for me, I sho..."
4,boris_johnson,1.txt,Thank you. And it may seem a bit premature to ...


In [5]:
# Total Words New Column
data['totalwords'] = [len(x.split()) for x in data['transcript'].tolist()]

In [6]:
def  text_processing(df, column):
    
    '''
    Args: Receive a dataframe and the column name to clean
    Function: Transform the indicated column by removing stop words, lowering case, 
              spelling correction, removing punctuation and removing numbers.
    Return: the clean dataframe
    '''
    
    df[column] = df[column].apply(lambda x: " ".join(x.lower() for x in x.split()))
    df[column] = df[column].apply(lambda x: " ".join(x for x in x.split()if x not in sw))
    df[column].apply(lambda x: str(TextBlob(x).correct()))
    df[column] = df[column].str.replace('[^\w\s]', '')
    df[column] = df[column].str.replace('[0-9]', '')
    
    return df

## Cleaning Data

In [7]:
new_data = text_processing(data, 'transcript')

In [8]:
new_data.head()

Unnamed: 0,speaker,conf_number,transcript,totalwords
0,boris_johnson,5.txt,"wish could tell pandemic we’re going over, wis...",4270
1,boris_johnson,4.txt,"thank you, mr. president, excellencies, ladies...",2125
2,boris_johnson,3.txt,"mr. speaker, thank much. permission, make stat...",1713
3,boris_johnson,2.txt,"many, many reasons me, say, come exeter colleg...",2374
4,boris_johnson,1.txt,thank you. may seem bit premature make speech ...,3797


In [9]:
new_data.to_csv("./data/cleaned_data.csv")