In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from spellchecker import SpellChecker

# Fetch the NLTK resources this notebook relies on: the 'punkt' tokenizer
# models and the 'stopwords' corpus (used via word_tokenize / stopwords below).
# nltk.download is a no-op when the package is already present locally.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jonat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jonat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Limpieza de Datos

In [2]:
# Read the training split from the working directory and preview it.
train_csv = 'train.csv'
data = pd.read_csv(train_csv)
data.head(n=5)

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate


In [3]:
# Sanity check: list every row whose discourse_id occurs more than once
# (keep=False flags all occurrences, not only the repeats).
is_repeated_id = data['discourse_id'].duplicated(keep=False)
duplicated_discourse_ids = data.loc[is_repeated_id]
duplicated_discourse_ids.sort_values('discourse_id')

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness


In [4]:
def handle_contractions(text, mapping=None):
    """Expand English contractions found in ``text``.

    The replacement is done with a regular expression on the raw string
    instead of on tokenizer output: a word tokenizer splits "don't" into
    "do" + "n't", so a per-token dictionary lookup never finds the
    contraction. Matching is case-insensitive and accepts both the ASCII
    apostrophe and the typographic one (U+2019). Text outside the matched
    contractions (spacing, punctuation) is left untouched.

    Parameters
    ----------
    text : str
        Text to process.
    mapping : dict[str, str], optional
        Contraction -> expansion table. Defaults to the module-level
        ``contractions`` dictionary.

    Returns
    -------
    str
        ``text`` with every known contraction replaced by its expansion.
        When the matched contraction is entirely lower-case the expansion is
        lower-cased too, so already lower-cased text stays lower-cased
        ("i'm" -> "i am", "I'm" -> "I am").
    """
    if mapping is None:
        mapping = contractions

    lookup = {key.lower(): value for key, value in mapping.items()}
    if not text or not lookup:
        return text

    apostrophe = "['\u2019]"
    # Longest keys first so that "can't've" wins over its prefix "can't".
    alternatives = "|".join(
        apostrophe.join(re.escape(part) for part in key.split("'"))
        for key in sorted(lookup, key=len, reverse=True)
    )
    # The look-arounds keep matches on word boundaries: "he's" must not match
    # inside "she's", and "shouldn't" must not match inside "shouldn't've".
    pattern = re.compile(
        "(?<![A-Za-z])(?:" + alternatives + ")(?!" + apostrophe + "?[A-Za-z])",
        flags=re.IGNORECASE | re.ASCII,
    )

    def expand(match):
        found = match.group(0).replace("\u2019", "'")
        expansion = lookup.get(found.lower())
        if expansion is None:
            # Defensive: leave the text as-is if the match cannot be mapped back.
            return match.group(0)
        return expansion.lower() if found.islower() else expansion

    return pattern.sub(expand, text)

# Shared spell checker and per-token memo. Building a SpellChecker loads its
# word-frequency dictionary, so it is created once instead of once per row,
# and each distinct token is looked up only once across all calls.
_SPELL_CHECKER = None
_SPELL_CACHE = {}


def spell_correction(text):
    """Return ``text`` with each token replaced by its spelling correction.

    The text is split with ``word_tokenize``; every token is passed to
    ``SpellChecker.correction`` and the tokens are joined back with single
    spaces. When the checker returns ``None`` for a token, the token is kept
    unchanged.

    Parameters
    ----------
    text : str
        Text to correct.

    Returns
    -------
    str
        Space-joined corrected tokens.
    """
    global _SPELL_CHECKER
    if _SPELL_CHECKER is None:
        _SPELL_CHECKER = SpellChecker()

    corrected_words = []
    for word in word_tokenize(text):
        if word not in _SPELL_CACHE:
            # Single lookup per token (the correction search is the slow part).
            suggestion = _SPELL_CHECKER.correction(word)
            _SPELL_CACHE[word] = word if suggestion is None else suggestion
        corrected_words.append(_SPELL_CACHE[word])
    return ' '.join(corrected_words)
    

# Lookup table mapping English contractions to their expanded form; read by
# handle_contractions.
# NOTE(review): the "I'd" / "I'll" / "I'm" / "I've" keys are capitalised while
# the cleaning cell lower-cases the text before expanding contractions, so a
# case-sensitive lookup cannot match them.
contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'll": "I will",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'd": "that would",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what's": "what is",
    "where's": "where is",
    "who'll": "who will",
    "who's": "who is",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have"
}

In [None]:
# Clean the discourse text in place.
#
# The order of the steps matters:
#   1. lower-case everything;
#   2. strip URLs and emojis while they are still intact -- once the text has
#      been tokenised and reduced to letters, a URL is already broken into
#      pieces and the URL pattern can no longer remove it cleanly;
#   3. expand contractions while the apostrophes are still present;
#   4. spell-correct;
#   5. keep only ASCII letters and whitespace. This also removes punctuation
#      and digits, so no separate punctuation pass is needed afterwards.
url_pattern = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F700-\U0001F77F"  # alchemical symbols
                           u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                           u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                           u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                           u"\U0001FA00-\U0001FA6F"  # Chess Symbols
                           u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                           u"\U00002702-\U000027B0"  # Dingbats
                           # NOTE(review): this last range is very broad (U+24C2..U+1F251)
                           # and also covers many non-emoji characters.
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)

# Convert everything to lower case
discourse_text = data['discourse_text'].str.lower()

# Remove URLs and emojis before any tokenisation happens
discourse_text = discourse_text.apply(lambda x: url_pattern.sub('', x))
discourse_text = discourse_text.apply(lambda x: emoji_pattern.sub(r'', x))

discourse_text = discourse_text.apply(handle_contractions)
discourse_text = discourse_text.apply(spell_correction)

# Remove punctuation, digits and any other non-letter character
discourse_text = discourse_text.apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))

data['discourse_text'] = discourse_text


In [None]:
# Drop English stop words, then strip any remaining digits.
stop = stopwords.words('english')
# A set gives constant-time membership tests; the NLTK list would be scanned
# linearly for every word.
stop_words = set(stop)

# Apply per row instead of looping over range(len(data)) with data.loc[i, ...]:
# that loop only works when the index is the default 0..n-1 and performs a
# slow scalar assignment for every row.
data['discourse_text'] = data['discourse_text'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_words)
)

data['discourse_text'] = data['discourse_text'].str.replace(r'\d+', '', regex=True)

In [None]:
# Preview the first rows after the cleaning steps above.
data.head()

In [None]:
# One-hot encode the two categorical columns and swap them for their
# indicator columns in a new frame (the source frame is not modified).
categorical_columns = ['discourse_type', 'discourse_effectiveness']
discourse_type_dummies, discourse_effectiveness_dummies = (
    pd.get_dummies(data[column], prefix=column) for column in categorical_columns
)

data_dummies = pd.concat(
    [data, discourse_type_dummies, discourse_effectiveness_dummies],
    axis='columns',
).drop(columns=categorical_columns)

data_dummies.head()