In [4]:
import pandas as pd
import nltk
from nltk import word_tokenize
nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.metrics import classification_report, accuracy_score
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marce\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
import textstat

# Função para calcular o índice Flesch-Kincaid
def calculate_flesch_kincaid(text):
    # Usando a biblioteca textstat para calcular o índice Flesch-Kincaid
    return textstat.flesch_kincaid_grade(text)


In [6]:
import spacy
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
import language_tool_python

# Carregar modelo do spaCy
nlp = spacy.load('en_core_web_sm')

# Instanciar o LanguageTool
tool = language_tool_python.LanguageTool('en-US')

# Funções para extração de características
def count_grammar_errors(text):
    # Use o objeto LanguageTool para verificar erros gramaticais no texto
    matches = tool.check(text)
    return len(matches)

def calculate_coherence(doc):
    # Convertendo doc.sents em uma lista para acessar sequencialmente
    sents_list = list(doc.sents)
    # Exemplo simples para calcular a média da coerência
    return np.mean([sent.similarity(next_sent) for sent, next_sent in zip(sents_list, sents_list[1:])])

def calculate_lexical_diversity(doc):
    # Exemplo simples para calcular a diversidade lexical
    return len(set([token.text.lower() for token in doc])) / len(doc)

def count_named_entities(doc):
    # Exemplo simples para contar entidades nomeadas
    return len([ent for ent in doc.ents])


# Adicionando o cálculo do Flesch-Kincaid no extrator de características
class TextFeaturesExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, nlp):
        self.nlp = nlp

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        features = []
        for doc in self.nlp.pipe(X):
            features.append([
                len(doc),  # Comprimento do texto
                count_grammar_errors(doc.text),  # Número de erros gramaticais
                calculate_coherence(doc),  # Medida de coerência
                calculate_lexical_diversity(doc),  # Diversidade lexical
                count_named_entities(doc),  # Número de entidades nomeadas
                calculate_flesch_kincaid(doc.text)  # Índice Flesch-Kincaid
            ])
        return np.array(features)

In [7]:
df = pd.read_csv('train.csv')

In [8]:
df['score'].value_counts()

score
3    6280
2    4723
4    3926
1    1252
5     970
6     156
Name: count, dtype: int64

In [9]:
#Remover somnete as quebras de linhas e deixar em minuscula
df['full_text']=df['full_text'].str.lower().str.replace('\n', ' ')

#remover pontuação
df['full_text']=df['full_text'].apply(lambda x: re.sub(r'^\s*|\s*$|[^\w\s]|[\d]', ' ', str(x))).str.lower().str.replace('\n', ' ')

In [10]:
n_sample = 200

def sample (df, group_column, n):
    return df.groupby(group_column).apply(lambda x: x.sample(n=min(len(x), n))).reset_index(drop=True)

filter = df[df['score'].isin([1,2,3,4,5,6])]

df_sample = sample(filter, 'score', n_sample)
df_sample['score'].value_counts()

score
1    200
2    200
3    200
4    200
5    200
6    156
Name: count, dtype: int64

In [11]:
# Instanciar o extrator de características
extractor = TextFeaturesExtractor(nlp)

# Aplicar a transformação no DataFrame
features_array = extractor.transform(df_sample['full_text'])

# Criar novas colunas no DataFrame com as características extraídas
df_sample['comprimento_texto'] = features_array[:, 0]
df_sample['num_grammatical_errors'] = features_array[:, 1]
df_sample['coherence_score'] = features_array[:, 2]
df_sample['lexical_diversity'] = features_array[:, 3]
df_sample['num_named_entities'] = features_array[:, 4]
df_sample['Flesch-Kincaid'] = features_array[:, 5]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return np.mean([sent.similarity(next_sent) for sent, next_sent in zip(sents_list, sents_list[1:])])


In [12]:
# Calcular a correlação entre as variáveis
correlation_matrix = df_sample[['score', 'comprimento_texto', 'num_grammatical_errors', 'coherence_score', 'lexical_diversity', 'num_named_entities',
                               'Flesch-Kincaid']].corr()

correlation_matrix

Unnamed: 0,score,comprimento_texto,num_grammatical_errors,coherence_score,lexical_diversity,num_named_entities,Flesch-Kincaid
score,1.0,0.814624,0.668643,0.336449,-0.675155,0.34891,0.81463
comprimento_texto,0.814624,1.0,0.900317,0.292852,-0.72858,0.538793,0.998863
num_grammatical_errors,0.668643,0.900317,1.0,0.268388,-0.59744,0.651028,0.88538
coherence_score,0.336449,0.292852,0.268388,1.0,-0.194919,0.182756,0.287756
lexical_diversity,-0.675155,-0.72858,-0.59744,-0.194919,1.0,-0.297744,-0.730094
num_named_entities,0.34891,0.538793,0.651028,0.182756,-0.297744,1.0,0.526345
Flesch-Kincaid,0.81463,0.998863,0.88538,0.287756,-0.730094,0.526345,1.0


In [27]:
# Preparar os dados para o modelo de classificação
X = df_sample[['comprimento_texto', 'num_grammatical_errors', 'lexical_diversity', 'num_named_entities','Flesch-Kincaid']]
y = df_sample['score']

# Dividir os dados em conjuntos de treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
# usando regressão logistica 
classify = Pipeline([
    ('classifier', LogisticRegression(max_iter=10000, random_state=42))])

# Treinar o modelo no conjunto de treino completo
classify.fit(X_train, y_train)

# Obter as previsões no conjunto de teste
y_test_pred = classify.predict(X_test)

scores = cross_val_score(classify, X_test, y_test, cv=4, scoring='f1_macro')
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Calcular e exibir o relatório de classificação para o conjunto de teste
test_report = classification_report(y_test, y_test_pred, digits=2)
print("Relatório de Classificação - Conjunto de Teste:\n", test_report)

Accuracy: 0.47 (+/- 0.11)
Relatório de Classificação - Conjunto de Teste:
               precision    recall  f1-score   support

           1       0.49      0.56      0.52        39
           2       0.60      0.64      0.62        45
           3       0.46      0.36      0.40        47
           4       0.36      0.41      0.38        34
           5       0.49      0.42      0.45        40
           6       0.61      0.63      0.62        27

    accuracy                           0.50       232
   macro avg       0.50      0.51      0.50       232
weighted avg       0.50      0.50      0.50       232

