In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
)

from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix, roc_curve, auc
%matplotlib inline

### Explorando os dados

In [4]:
data = pd.read_csv("./LLM.csv")

In [5]:
data.head()

Unnamed: 0,Text,Label
0,y r u always l8 to the meetings?,student
1,The project team embraced a user-centric desig...,ai
2,"i dont like dealing with risks, it's too stres...",student
3,"i dont worry about reliability, it's good enough",student
4,"i dont care about human-centered design, just ...",student


### Divider dados de teste e treino

In [7]:
features = data["Text"].to_numpy()
features

array(['y r u always l8 to the meetings?',
       'The project team embraced a user-centric design approach for product development.',
       "i dont like dealing with risks, it's too stressful", ...,
       'i dont care about multi-method approaches, just give me the results',
       'i dont understand variables, just give me the answer',
       "i dont worry about encryption, it's overrated"], dtype=object)

In [8]:
outputs = data["Label"].to_numpy()
outputs

array(['student', 'ai', 'student', ..., 'student', 'student', 'student'],
      dtype=object)

In [9]:
x_train_raw, x_test_raw, y_train_raw, y_test_raw = train_test_split(
    features, outputs, test_size=0.3
)
x_train_raw[:5], x_test_raw[:5], y_train_raw[:5], y_test_raw[:5]

(array(["i dont like artificial intelligence, it's too unpredictable",
        "i dont like agile approaches, they're too unpredictable",
        'In the world of robotics, advancements in artificial intelligence enable more complex tasks.',
        'i dont need practical implications, just tell me what to do',
        'The findings have practical implications for real-world applications.'],
       dtype=object),
 array(['The data collection process was systematic and thorough.',
        'The analysis considered the correlation between social media usage and mental health.',
        'i dont talk about limitations, it makes me look bad',
        'The data analysis software used is widely accepted in the field.',
        'The research design ensured data privacy and confidentiality.'],
       dtype=object),
 array(['student', 'student', 'ai', 'student', 'ai'], dtype=object),
 array(['ai', 'ai', 'student', 'ai', 'ai'], dtype=object))

In [11]:
np.where((y_test_raw != 'ai') & (y_test_raw != 'student'))

(array([], dtype=int64),)

In [12]:
y_test_raw

array(['ai', 'ai', 'student', 'ai', 'ai', 'ai', 'ai', 'student',
       'student', 'student', 'ai', 'student', 'student', 'ai', 'ai',
       'student', 'student', 'student', 'student', 'ai', 'ai', 'ai',
       'student', 'ai', 'student', 'student', 'student', 'student', 'ai',
       'student', 'student', 'student', 'student', 'ai', 'student', 'ai',
       'ai', 'ai', 'student', 'student', 'student', 'student', 'student',
       'student', 'ai', 'ai', 'student', 'ai', 'student', 'ai', 'student',
       'student', 'student', 'ai', 'student', 'student', 'student',
       'student', 'student', 'student', 'student', 'student', 'student',
       'ai', 'ai', 'ai', 'ai', 'ai', 'ai', 'ai', 'student', 'ai',
       'student', 'ai', 'student', 'ai', 'ai', 'student', 'student',
       'student', 'ai', 'student', 'student', 'ai', 'ai', 'student',
       'student', 'ai', 'ai', 'ai', 'student', 'student', 'student', 'ai',
       'student', 'student', 'student', 'student', 'ai', 'ai', 'ai', 'ai',
     

### Tratamento dos Dados

Isso será necessário porque o LDA não aceita matrizes "sparse"; precisamos convertê-las em "dense" antes.

In [None]:
class DetectAITransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.X = None

    def _parse_text(text: str) -> np.ndarray:
        has_dont = 0
        starts_upper = 0
        has_comma = 0

        right_words = 0
        total_words = 0

        words = text.split()
        for word in words:
            if word == "don't":
                has_dont = 1

            if len(word) > 0:
                starts_upper = 1 if word[0].isupper() else 0

                if word[-1] == ",":
                    has_comma = 1

            if word.lower().rstrip(",".rstrip("'s")) in words_list:
                right_words += 1

            total_words += 1

        return np.array([has_dont, right_words / total_words, starts_upper, has_comma])
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        

In [101]:
class ToDenseTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X.toarray()

In [102]:
text_pipeline = Pipeline(
    [
        ("vectorize_text", CountVectorizer()),
        ("dense_matrix", ToDenseTransformer())
    ]
)
text_pipeline

In [103]:
x_train = text_pipeline.fit_transform(x_train_raw)
x_train

: 

In [72]:
x_test = text_pipeline.transform(x_test_raw)
x_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [77]:
label_encoder = LabelEncoder()

In [90]:
y_train = label_encoder.fit_transform(y_train_raw)
y_test = label_encoder.transform(y_test_raw)
y_train, y_test

(array([1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
        1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,
        0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0,
        1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0,
        0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1,
        1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0,
        1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,
        0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1,
        0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1,
        0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1,
        0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1,
        1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1,
        1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
        1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 

In [91]:
label_encoder.classes_

array(['ai', 'student'], dtype=object)

### LDA

In [92]:
lda = LinearDiscriminantAnalysis()
lda.fit(x_train, y_train)
pred = lda.predict(x_test)

ValueError: X has 641 features, but LinearDiscriminantAnalysis is expecting 862 features as input.