# Non-model-based

In [None]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Locate the CSV relative to the project root, taken to be the parent of the
# directory the kernel is currently running in.
here = os.getcwd()
project_path = os.path.split(here)[0]
relative_csv = os.path.join("data", "session_speech.csv")
data_path = os.path.join(project_path, relative_csv)

In [None]:
# Every cell of the "speech" column is passed through `eval` while parsing.
# NOTE(review): `eval` executes whatever expression the CSV contains, so this
# is only acceptable for a trusted, locally produced file. If the cells are
# plain Python literals, `ast.literal_eval` would be a safer converter --
# confirm against the data before switching.
speech_converters = {"speech": eval}
data = pd.read_csv(data_path, converters=speech_converters)
data.columns

In [None]:
# Dimensions (rows, columns) of the table exactly as loaded from the CSV.
data.shape

In [None]:
# Put every element of "speech" on its own row, then keep only the rows that
# actually carry a speech and whose vote is not one of the excluded labels.
excluded_votes = ["abstención", "ausente"]
data = (
    data
    .explode(column="speech")
    .loc[lambda rows: rows["speech"].notna() & ~rows["vote"].isin(excluded_votes)]
)
data.shape

## Difference of frequencies

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Count every term in every speech, then add the counts up within each vote
# label: `frequencies` ends with one row per vote and one column per term.
vectorizer = CountVectorizer(lowercase=True)
X = vectorizer.fit_transform(data.speech).toarray()  # densified count matrix
vocabulary = vectorizer.get_feature_names_out()
vote_index = pd.Index(data["vote"], name="senator_vote")
frequencies = (
    pd
    .DataFrame(X, columns=vocabulary, index=vote_index)
    .groupby(level="senator_vote")
    .sum()
)

In [None]:
# Display the summed term counts (rows: vote label, columns: vocabulary term).
frequencies

In [None]:
# Per-word raw counts for each vote, their overall total, and the gap between
# the two votes ("positivo" minus "negativo").
pos_counts = frequencies.loc["positivo"]
neg_counts = frequencies.loc["negativo"]
count_total = frequencies.sum(axis=0)
count_difference = pos_counts - neg_counts
count_columns = {
    "diff": count_difference,
    "total": count_total,
    "pos": pos_counts,
    "neg": neg_counts,
}
# Move the word labels out of the index into a regular "word" column.
count_diff = (
    pd.DataFrame(count_columns)
    .rename_axis(index="word")
    .reset_index()
)
count_diff.head()

## Difference of proportions

In [None]:
# Proportion of each vote's tokens accounted for by every word: counts are
# divided by the total number of tokens of the *same vote* (row-wise), so each
# row of `proportions` sums to 1.
# Dividing by `frequencies.sum(axis=0)` instead would scale every word by its
# own total across votes and turn "diff" into (pos - neg) / (pos + neg), which
# is not a difference of proportions.
tokens_per_vote = frequencies.sum(axis=1)
proportions = frequencies.div(tokens_per_vote, axis=0)
proportions_difference = proportions.loc["positivo"] - proportions.loc["negativo"]
proportions_diff = (
    pd
    .DataFrame({
        "diff": proportions_difference,
        "pos": proportions.loc["positivo"],
        "neg": proportions.loc["negativo"]
    })
    .rename_axis("word", axis=0)
    .reset_index()
)
proportions_diff.head()

## Correction: removing stop words

## Odds

In [None]:
# Odds of a word within each vote class: f / (1 - f), where f is the word's
# share of all tokens used by that class. (A plain pos/neg count ratio is not
# an odds, and dividing it by its own reciprocal only squares it.)
word_share = frequencies.div(frequencies.sum(axis=1), axis=0)
odds = word_share / (1 - word_share)
odds_pos = odds.loc["positivo"]
odds_neg = odds.loc["negativo"]
# Odds ratio of "positivo" relative to "negativo", kept under the existing
# name and stored in the "diff" column.
# Words never used by one of the two classes yield 0 or inf here (NaN if
# absent from both); no smoothing is applied in this cell.
odds_difference = odds_pos / odds_neg
odds_diff = (
    pd
    .DataFrame({
        "diff": odds_difference,
        "pos": odds_pos,
        "neg": odds_neg
    })
    .rename_axis("word", axis=0)
    .reset_index()
)
odds_diff.head()

## Log-odds-ratio

Agregar suavizado, pero cuidado: no se puede agregar peso sin más a una frecuencia (la suma ya no va a dar 1).

In [None]:
from numpy import log

In [None]:
# Natural logarithm of the "diff" column, stored next to it as "log_odds".
odds_diff = odds_diff.assign(log_odds=log(odds_diff["diff"]))
odds_diff.head()

## Correction: eliminating low-frequency words

## TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF weight of every term in every speech, summed within each vote label.
# `vectorizer` and `X` are rebound here to the TF-IDF versions.
vectorizer = TfidfVectorizer(lowercase=True)
X = vectorizer.fit_transform(data.speech).toarray()  # densified weight matrix
tfidf_terms = vectorizer.get_feature_names_out()
tfidf_index = pd.Index(data["vote"], name="senator_vote")
tfidf = (
    pd
    .DataFrame(X, columns=tfidf_terms, index=tfidf_index)
    .groupby(level="senator_vote")
    .sum()
)
tfidf.head()

In [None]:
corr_pos = tfidf.loc["positivo"].corr(proportions_total)
corr_neg = tfidf.loc["negativo"].corr(proportions_total)
print(f"""
Correlación con proporciones:
    - POSITIVO: {round(corr_pos,2)}
    - NEGATIVO: {round(corr_neg,2)}
""")

In [None]:
tfidf.loc["negativo"].corr(proportions_total)

## WordScores

Ideas:
- en esta nb, hacer las visualizaciones propuestas por el paper para todos los casos
- en la nb de clasificación voy a necesitar features, puedo elegir alguna de estas técnicas visualizando con 
- 