# Proyecto 2

In [1]:
#!pip install pyspellchecker
#!pip install textatistic
#!pip install -U LeXmo
#!pip install emot
#!pip install nrclex
#!pip install swifter
#!pip install spacy
#!python -m spacy download en_core_web_sm
#!pip install enchant
#!pip install language_tool_python

In [2]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from tqdm import tqdm
import swifter

import string
import re
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from spellchecker import SpellChecker
from textatistic import Textatistic
from LeXmo import LeXmo
import emot
import nrclex
from language_tool_python import LanguageTool

import matplotlib.pyplot as plt
import seaborn as sns

# WordNetLemmatizer is instantiated below but was missing from the imports,
# which made this cell fail with NameError on a fresh kernel.
from nltk.stem import WordNetLemmatizer

# One-time NLTK resource downloads; uncomment on a fresh environment.
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')  # corpus needed by WordNetLemmatizer

# Register progress_apply on pandas objects.
tqdm.pandas()

# Shared NLP resources, created once and reused by every feature function.
nlp = spacy.load("en_core_web_sm")
spell = SpellChecker()
tool = LanguageTool('en-US')
lemmatizer = WordNetLemmatizer()

# Lookup lists used by the stop-word and punctuation helpers.
STOP_WORDS = stopwords.words('english')
PUNCT_MARKS = list(string.punctuation)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\osjom\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Read the training set from the working directory and preview its first rows.
data = pd.read_csv(filepath_or_buffer='train.csv')
data.head(n=5)

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate


## Preprocesamiento de Datos

### Fase 1

In [6]:
def spelling_errors(text, checker=None):
    """Count the words in ``text`` that the spell checker does not recognise.

    The original implementation split on whitespace, so tokens kept their
    attached punctuation ("now.", "Hi,") and were reported as misspellings.
    Words are now extracted with a regex that keeps letters and inner
    apostrophes only.

    Parameters
    ----------
    text : str
        Raw text to inspect.
    checker : object, optional
        Any object exposing ``unknown(words)``. Defaults to the notebook-level
        ``spell`` instance, so existing callers are unaffected.

    Returns
    -------
    int
        Size of the collection returned by ``checker.unknown``.
    """
    if checker is None:
        checker = spell
    # Unicode letters, optionally joined by apostrophes (e.g. "don't").
    words = re.findall(r"[^\W\d_]+(?:'[^\W\d_]+)*", text)
    return len(checker.unknown(words))

def grammar_errors(text):
    """Return how many matches the shared LanguageTool instance reports for ``text``."""
    return len(tool.check(text))

def punctuation_marks(text):
    """Return the number of characters in ``text`` that appear in ``PUNCT_MARKS``."""
    found = [symbol for symbol in text if symbol in PUNCT_MARKS]
    return len(found)

In [7]:
# Phase 1: error and punctuation counts computed on the raw discourse text.
for column_name, extractor in (
    ('spelling_errors', spelling_errors),
    ('grammar_errors', grammar_errors),
    ('punctuation_marks', punctuation_marks),
):
    data[column_name] = data['discourse_text'].swifter.apply(extractor)
data.head()

Pandas Apply:   0%|          | 0/36765 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/36765 [00:00<?, ?it/s]

KeyboardInterrupt: 

### Fase 2

In [None]:
# Mapping from English contractions to their expanded forms.
contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'll": "I will",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'd": "that would",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what's": "what is",
    "where's": "where is",
    "who'll": "who will",
    "who's": "who is",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have"
}

# Lower-cased view of the table so lookups can be case-insensitive.
_CONTRACTIONS_LOWER = {key.lower(): value for key, value in contractions.items()}


def _contraction_regex(key):
    """Return a regex fragment for ``key`` accepting a straight or curly apostrophe."""
    return "['\u2019]".join(re.escape(part) for part in key.split("'"))


# Longest keys first so "he'd've" wins over "he'd" at the same position.
_CONTRACTION_PATTERN = re.compile(
    r"(?<!\w)(?:"
    + "|".join(
        _contraction_regex(key)
        for key in sorted(_CONTRACTIONS_LOWER, key=len, reverse=True)
    )
    + r")(?!\w)",
    flags=re.IGNORECASE,
)


def _expand_contraction(match):
    """Return the expansion for one regex match, keeping a leading capital letter."""
    found = match.group(0)
    expansion = _CONTRACTIONS_LOWER[found.lower().replace("\u2019", "'")]
    if found[0].isupper():
        expansion = expansion[0].upper() + expansion[1:]
    return expansion


def handle_contractions(text):
    """Expand the English contractions in ``text`` using the ``contractions`` table.

    The previous version ran ``word_tokenize`` first, which splits contractions
    into separate tokens (e.g. "do" + "n't"), so the dictionary lookup never
    matched and nothing was expanded. Matching is now done directly on the
    text with a case-insensitive regex, and the rest of the text (spacing,
    punctuation) is left untouched.

    Parameters
    ----------
    text : str
        Text that may contain contractions.

    Returns
    -------
    str
        ``text`` with every known contraction replaced by its expansion.
    """
    return _CONTRACTION_PATTERN.sub(_expand_contraction, text)

def spell_correction(text):
    """Replace each token of ``text`` with the spell checker's suggestion.

    ``spell.correction`` used to be called twice per token (once for the
    ``None`` test and once for the value). It is now called at most once per
    distinct token; the result is unchanged.

    Parameters
    ----------
    text : str
        Text to correct.

    Returns
    -------
    str
        The tokens produced by ``word_tokenize``, corrected where a suggestion
        exists, joined with single spaces.
    """
    suggestions = {}
    corrected_words = []
    for word in word_tokenize(text):
        if word not in suggestions:
            candidate = spell.correction(word)
            # Keep the original token when the checker has no suggestion.
            suggestions[word] = word if candidate is None else candidate
        corrected_words.append(suggestions[word])
    return ' '.join(corrected_words)

def end_sentence(text):
    """Make sure ``text`` ends with sentence-final punctuation.

    Trailing whitespace is removed before the check so that input such as
    ``"word "`` becomes ``"word."`` rather than ``"word ."``. Empty or
    whitespace-only input is returned unchanged; it used to raise IndexError.

    Parameters
    ----------
    text : str
        Text to terminate.

    Returns
    -------
    str
        The text ending in '!', '?' or '.'; a '.' is appended when needed.
    """
    trimmed = text.rstrip()
    if not trimmed:
        return text
    if trimmed.endswith(('!', '?', '.')):
        return trimmed
    return trimmed + '.'

In [None]:
# Phase 2: normalise the text by expanding contractions, correcting spelling,
# terminating the final sentence, and lower-casing.
data['discourse_text'] = (
    data['discourse_text']
    .swifter.apply(handle_contractions)
    .swifter.apply(spell_correction)
    .swifter.apply(end_sentence)
    .str.lower()
)
data.head()

### Fase 3

In [None]:
def readability_features(text):
    """Return ten Textatistic counts and readability scores for ``text`` as a Series.

    The values are ordered: sentence, word, syllable and character counts,
    polysyllabic word count, then the Dale-Chall, Flesch, Flesch-Kincaid,
    Gunning Fog and SMOG scores.
    """
    metrics = Textatistic(text)
    attribute_names = (
        'sent_count', 'word_count', 'sybl_count', 'char_count',
        'polysyblword_count', 'dalechall_score', 'flesch_score',
        'fleschkincaid_score', 'gunningfog_score', 'smog_score',
    )
    return pd.Series([getattr(metrics, attribute) for attribute in attribute_names])

def avg_word_length(text):
    """Return the mean length of the whitespace-separated words in ``text``.

    Parameters
    ----------
    text : str
        Text to measure.

    Returns
    -------
    float
        Average number of characters per word, or ``0.0`` when ``text`` has no
        words (this case used to raise ZeroDivisionError).
    """
    words = text.split()
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)

def stop_words(text):
    """Count the whitespace-separated words of ``text`` whose lower-cased form is in ``STOP_WORDS``."""
    matches = [token for token in text.split() if token.lower() in STOP_WORDS]
    return len(matches)

def lexical_types(text):
    """Return the number of distinct tokens that ``word_tokenize`` produces for ``text``."""
    return len(set(word_tokenize(text)))

def syntatic_complexity_features(text):
    """Return four syntactic-complexity proportions for ``text`` as a Series.

    Each count is divided by the number of spaCy tokens in the parsed text:
    ROOT tokens, tokens whose dependency is ``acl`` or ``advcl``, noun chunks
    whose root is tagged VERB, and noun chunks longer than one token.

    Parameters
    ----------
    text : str
        Text to parse with the shared ``nlp`` pipeline.

    Returns
    -------
    pandas.Series
        ``[clauses, sub_clauses, verb_phrases, complex_noun_phrases]``, each
        as a proportion of the token count. All zeros when the text yields no
        tokens (this case used to raise ZeroDivisionError).
    """
    doc = nlp(text)
    total_tokens = len(doc)
    if total_tokens == 0:
        return pd.Series([0.0, 0.0, 0.0, 0.0])

    num_clauses = sum(1 for token in doc if token.dep_ == "ROOT")
    num_sub_clauses = sum(1 for token in doc if token.dep_ in ("acl", "advcl"))

    # Materialise the chunks once; both chunk-based counts reuse them.
    noun_chunks = list(doc.noun_chunks)
    # NOTE(review): noun-chunk roots are normally nouns or pronouns, so this count
    # is probably almost always 0. Confirm whether verb phrases were intended.
    num_verb_phrases = sum(1 for chunk in noun_chunks if chunk.root.pos_ == "VERB")
    num_complex_noun_phrases = sum(1 for chunk in noun_chunks if len(chunk) > 1)

    return pd.Series([num_clauses / total_tokens, num_sub_clauses / total_tokens,
                      num_verb_phrases / total_tokens,
                      num_complex_noun_phrases / total_tokens])

In [None]:
# Phase 3: length, readability, lexical and syntactic features.
# Character length via the vectorised string accessor.
data['text_length'] = data['discourse_text'].str.len()
# The helper is named readability_features; the previous call used the
# undefined name readability_scores and raised NameError.
data[['sent_count', 'word_count', 'sybl_count', 'char_count',
      'polysyblword_count', 'dalechall_score', 'flesch_score',
      'fleschkincaid_score', 'gunningfog_score', 'smog_score']] = data['discourse_text'].swifter.apply(readability_features)
data['stop_words'] = data['discourse_text'].swifter.apply(stop_words)
data['lexical_types'] = data['discourse_text'].swifter.apply(lexical_types)
data[['clauses_prop', 'sub_clauses_prop',
      'verb_phrases_prop', 'noun_phrases_prop']] = data['discourse_text'].swifter.apply(syntatic_complexity_features)

In [None]:
# Ratios derived from the Textatistic counts.
sentence_totals = data['sent_count']
data['chars_per_word'] = data['char_count'].div(data['word_count'])
data['words_per_sentence'] = data['word_count'].div(sentence_totals)
data['sybl_per_sentence'] = data['sybl_count'].div(sentence_totals)
data.head()

### Fase 4

In [7]:
def remove_stop_words(text):
    """Return ``text`` without the whitespace-separated words found in ``STOP_WORDS``."""
    kept_words = []
    for token in text.split():
        if token not in STOP_WORDS:
            kept_words.append(token)
    return ' '.join(kept_words)

def lemmatize_text(text):
    """Lemmatize each whitespace-separated word of ``text`` and rejoin them with spaces."""
    return ' '.join(lemmatizer.lemmatize(token) for token in text.split())

def emotions(text):
    """Return the ten LeXmo emotion and sentiment scores for ``text`` as a Series.

    Values are ordered: anger, anticipation, disgust, fear, joy, negative,
    positive, sadness, surprise, trust.
    """
    scores = LeXmo.LeXmo(text)
    emotion_keys = ('anger', 'anticipation', 'disgust', 'fear', 'joy', 'negative',
                    'positive', 'sadness', 'surprise', 'trust')
    return pd.Series([scores[key] for key in emotion_keys])

In [None]:
# Phase 4: strip everything except ASCII letters and whitespace, lemmatize,
# then score emotions on the stop-word-free text.
# The former second pass with r'[^\w\s]' was dropped: once only letters and
# whitespace remain it cannot match anything.
data['discourse_text'] = data['discourse_text'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))
data['discourse_text'] = data['discourse_text'].swifter.apply(lemmatize_text)
# The column list was missing its closing bracket, which was a SyntaxError.
data[['anger', 'anticipation', 'disgust',
      'fear', 'joy', 'negative', 'positive',
      'sadness', 'surprise', 'trust']] = data['discourse_text'].swifter.apply(remove_stop_words).swifter.apply(emotions)
data.head()

### Exportación

In [11]:
# Write the feature table to disk without the DataFrame index.
data.to_csv(path_or_buf='train_features.csv', index=False)

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,word_count,avg_word_length,sentences_count,words_per_sentence,readability
0,0013cc385424,007ACE74B050,hi isaac going writing face mars natural landf...,Lead,Adequate,67,3.731343,3,22.333333,7.89403
1,9704a709b505,007ACE74B050,perspective think face natural landform dont t...,Position,Adequate,41,4.121951,2,20.5,11.126829
2,c22adee811b6,007ACE74B050,think face natural landform life mars descover...,Claim,Adequate,21,4.0,1,21.0,12.209524
3,a10d361e54e4,007ACE74B050,life mars would know reason think natural land...,Evidence,Adequate,72,4.027778,4,18.0,8.537778
4,db3e453ec4e2,007ACE74B050,people thought face formed alieans thought lif...,Counterclaim,Adequate,18,4.611111,1,18.0,3.6
