In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
)

from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix, roc_curve, auc
%matplotlib inline

### Explorando os dados

In [3]:
data = pd.read_csv("./LLM.csv")

In [4]:
data.head()

Unnamed: 0,Text,Label
0,y r u always l8 to the meetings?,student
1,The project team embraced a user-centric desig...,ai
2,"i dont like dealing with risks, it's too stres...",student
3,"i dont worry about reliability, it's good enough",student
4,"i dont care about human-centered design, just ...",student


In [10]:
with pd.option_context('display.max_colwidth', None):
    display(data.sample(10)) 

Unnamed: 0,Text,Label
401,"i dont care about CRISPR technology, just give me the results",student
673,"i dont see the point of professionalism, it's overrated",student
515,"i dont validate surveys, it takes too much time",student
726,The findings have practical implications for real-world applications.,ai
405,The project team consistently met project milestones.,ai
365,The research question is designed to address real-world challenges faced by practitioners.,ai
371,The project team demonstrated adaptability in response to client requirements.,ai
55,"i dont care about biofuels, just give me the data",student
3,"i dont worry about reliability, it's good enough",student
563,"i dont worry about anonymity, it's overrated",student


### Padrões encontrados

Olhando para os exemplares, podemos observar alguns padrões.

1. Os textos de estudantes tendem a ser mais negativos do que os da IA.
2. Os textos da IA tendem a conter vocabulário e gramática melhor.
3. Os textos da IA tendem a ser mais formais.

É provável que essas constatações não sejam generalizáveis no mundo real e que tenham sido encontrados por viés ou erro na coleta dos dados. No entanto, o dataset escolhido não parece ser muito realista e os modelos de aprendizagem de máquina só podem ser tão bons quanto os dados que eles utilizam.

Portanto, iremos, por motivos de aprendizado e didática, generalizar em cima dos dados como estão e as observações acima serão úteis para alguns dos modelos mais simples.

Mas, primeiro, vamos confirmar que esse padrão se repete em outros trechos.


In [11]:
with pd.option_context('display.max_colwidth', None):
    display(data.sample(10)) 

Unnamed: 0,Text,Label
616,The integration of 5G technology is revolutionizing communication and connectivity.,ai
966,"i dont talk about limitations, it makes me look bad",student
519,The analysis considered the impact of outliers on data interpretation.,ai
179,"i dont like communicating, it's a waste of time",student
765,why do we even need to learn this stuff?,student
960,"i dont validate surveys, it takes too much time",student
352,"i dont care about ethics, it's too restrictive",student
482,"i dont care about outliers, just give me the main results",student
211,"i dont like theory, just give me what works",student
546,"i dont document processes, it's too tedious",student


In [12]:
with pd.option_context('display.max_colwidth', None):
    display(data.sample(10)) 

Unnamed: 0,Text,Label
392,The research question is framed to address practical issues faced by industry professionals.,ai
1059,The project team maintained a collaborative and open-door communication policy.,ai
568,"i dont care about implications, just tell me what to do",student
250,The findings contribute to the ongoing discourse within the academic community.,ai
365,The research question is designed to address real-world challenges faced by practitioners.,ai
1073,"i dont care about diverse perspectives, just give me the data",student
414,"i dont like artificial intelligence, it's too unpredictable",student
294,"i dont like communicating progress, it's a waste of time",student
948,"i dont care about employee engagement, just get the work done",student
587,The use of CRISPR technology in agriculture aims for improved crop yields and resistance.,ai


Parece que isso se confirmou nos demais dados com uma única exceção sem a palavra "don't". De qualquer forma, os features parecem bem adequados.

### Criação dos features e outputs

Agora vamos criar um vetor binário representando as nossas observações acima.

Para analisar a correção gramatical e de vocabulária iremos utilizar a razão entre o número de palavras corretas sobre o número de palavras totais. Poderia ser útil também outras verificações simples como a presença de ponto ao fim da frase e início com letra maiúscula, ou então mais complexas utilizando a biblioteca NLTK, que possui o módulo `nltk.grammar`. No entanto, iremos manter nosso modelo simples e utilizaremos somente esse índice.

Para analisar o sentimento de negatividade iremos contar o número de palavras "negativas".

In [14]:
# Fonte: https://github.com/first20hours/google-10000-english/tree/master
with open("words.txt", "r") as words_file:
    words_list = set(words_file.read().splitlines()[:6_000])

words_list

{'sectors',
 'elected',
 'than',
 'la',
 'shoe',
 'expensive',
 'kit',
 'angeles',
 'radiation',
 'woods',
 'hardware',
 'creation',
 'bought',
 'ct',
 'portions',
 'serves',
 'bears',
 'worldwide',
 'ed',
 'agents',
 'step',
 'kinds',
 'thru',
 'william',
 'sur',
 'bonds',
 'cute',
 'accuracy',
 'may',
 'nurse',
 'give',
 'jacksonville',
 'drew',
 'script',
 'perspective',
 'denied',
 'designs',
 'disaster',
 'fall',
 'dakota',
 'tell',
 'dvd',
 'sublime',
 'ftp',
 'minor',
 'z',
 'recommend',
 'philosophy',
 'decisions',
 'michelle',
 'orleans',
 'secure',
 'offers',
 'exciting',
 'local',
 'insider',
 'could',
 'insert',
 'ceiling',
 'hire',
 'rep',
 'skiing',
 'mission',
 'compression',
 'fewer',
 'hearing',
 'port',
 'repair',
 'racing',
 'involving',
 'classroom',
 'physical',
 'creative',
 'overnight',
 'arizona',
 'etc',
 'syndrome',
 'corp',
 'skin',
 'returned',
 'signal',
 'driver',
 'grove',
 'binary',
 'certain',
 'revision',
 'ne',
 'tax',
 'substantial',
 'rough',
 'euro

In [15]:
def text_to_feature(text: str) -> np.ndarray:
    has_dont = 0
    starts_upper = 0
    has_comma = 0

    right_words = 0
    total_words = 0

    words = text.split()
    for word in words:
        if word == "don't":
            has_dont = 1

        if len(word) > 0:
            starts_upper = 1 if word[0].isupper() else 0

            if word[-1] == ",":
                has_comma = 1

        if word.lower().rstrip(",".rstrip("'s")) in words_list:
            right_words += 1

        total_words += 1

    return np.array([has_dont, right_words / total_words, starts_upper, has_comma])

In [19]:
features = data["Text"].map(lambda txt : text_to_feature(txt)).to_numpy()
features, len(features)

(array([array([0.  , 0.75, 0.  , 0.  ]),
        array([0.        , 0.72727273, 0.        , 0.        ]),
        array([0.        , 0.77777778, 0.        , 1.        ]), ...,
        array([0.        , 0.90909091, 0.        , 1.        ]),
        array([0., 1., 0., 1.]),
        array([0.        , 0.57142857, 0.        , 1.        ])],
       dtype=object),
 1103)

In [18]:
outputs = data["Label"].to_numpy()
outputs, len(outputs)

(array(['student', 'ai', 'student', ..., 'student', 'student', 'student'],
       dtype=object),
 1103)

### Divider dados de teste e treino

In [22]:
x_train, x_test, y_train, y_test = train_test_split(
    features, outputs, test_size=0.3
)

In [24]:
x_train

array([array([0., 1., 0., 0.]),
       array([0.        , 0.77777778, 0.        , 1.        ]),
       array([0.        , 0.81818182, 0.        , 0.        ]),
       array([0. , 0.7, 0. , 0. ]),
       array([0.        , 0.77777778, 0.        , 1.        ]),
       array([0.  , 0.75, 0.  , 0.  ]), array([0., 1., 0., 1.]),
       array([0., 1., 0., 1.]), array([0., 1., 0., 1.]),
       array([0.   , 0.625, 0.   , 1.   ]), array([0. , 0.6, 0. , 0. ]),
       array([0., 1., 0., 1.]),
       array([0.        , 0.55555556, 0.        , 0.        ]),
       array([0.   , 0.625, 0.   , 1.   ]), array([0. , 0.8, 0. , 1. ]),
       array([0.   , 0.875, 0.   , 0.   ]),
       array([0.  , 0.75, 0.  , 1.  ]), array([0. , 0.8, 0. , 0. ]),
       array([0.        , 0.66666667, 0.        , 0.        ]),
       array([0.        , 0.90909091, 0.        , 1.        ]),
       array([0.        , 0.77777778, 0.        , 0.        ]),
       array([0.  , 0.75, 0.  , 0.  ]),
       array([0.        , 0.888

### LDA

In [23]:
lda = LinearDiscriminantAnalysis()
lda.fit(x_train, y_train)
pred = lda.predict(x_test)

ValueError: setting an array element with a sequence.