In [33]:
import pandas as pd
from api.database import *
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import re
from collections import Counter
import unicodedata
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Fix PyTorch's global RNG seed so randomly initialised layers are repeatable between runs.
torch.manual_seed(1)
# Prerequisite: the spaCy pipelines named later in this notebook must be installed first,
# e.g. from a shell:  python -m spacy download <corpus_name>

<torch._C.Generator at 0x7fe5913c8730>

In [15]:
def test_df_loading():
    """Smoke-check the database helper: fetch ten joined deputy/text rows and print them.

    Returns nothing; the resulting DataFrame is only written to stdout.
    """
    query = "SELECT nom, text, x, y from deputes join texts on texts.deputes_id = deputes.id LIMIT 10"
    print(pd.DataFrame(get_df(query)))

# Names of the spaCy pipelines used by the text helpers below:
# `corpus_a` is picked when a helper is called with efficiency=False,
# `corpus_e` when it is called with efficiency=True (the default).
corpus_a = "fr_dep_news_trf"
corpus_e = "fr_core_news_sm"

# Small sample (10 rows) of deputy name, speech text and x/y columns.
d = get_df("SELECT nom, text, x, y from deputes join texts on texts.deputes_id = deputes.id LIMIT 10")
df = pd.DataFrame(d)

In [19]:


# Loaded spaCy pipelines keyed by model name, so each model is loaded at most once
# per kernel session instead of once per processed text.
_PIPELINE_CACHE = {}


def _load_pipeline(corpus):
    """Return the spaCy pipeline named `corpus`, loading it only on first use."""
    if corpus not in _PIPELINE_CACHE:
        _PIPELINE_CACHE[corpus] = spacy.load(corpus)
    return _PIPELINE_CACHE[corpus]


def separe_to_sentences(text, efficiency = True):
    """Split `text` into sentences with spaCy.

    Args:
        text: raw text to segment.
        efficiency: when True (default) use the `corpus_e` pipeline, otherwise `corpus_a`.

    Returns:
        A list of sentence strings, each passed through Unicode NFKD normalisation.

    Raises:
        AssertionError: if the parsed document carries no sentence-boundary annotation.
    """
    nlp = _load_pipeline(corpus_e if efficiency else corpus_a)
    doc = nlp(text)
    assert doc.has_annotation("SENT_START")
    # NOTE(review): NFKD also decomposes accented letters into base letter + combining
    # mark; confirm this is intended for French text (NFKC would keep them composed).
    return [unicodedata.normalize("NFKD", sent.text) for sent in doc.sents]


def process_text(text, efficiency = True):
    """Turn `text` into a list of lower-cased lemmas.

    Parenthesised spans are removed first; stop words and tokens that are not purely
    alphabetic are skipped.

    Args:
        text: raw text to clean.
        efficiency: when True (default) use the `corpus_e` pipeline, otherwise `corpus_a`.

    Returns:
        A list of lemma strings in document order.
    """
    # Drop every "(...)" span before parsing.
    text = re.sub(r'\([^)]*\)', '', text)
    nlp = _load_pipeline(corpus_e if efficiency else corpus_a)
    doc = nlp(text)
    return [token.lemma_.lower() for token in doc if not token.is_stop and token.is_alpha]

def create_lexic(d2, size = 1000):
    """Build the list of the most frequent lemmas found in the "text" column of `d2`.

    Each text is cleaned with `process_text`; texts yielding one token or fewer are
    ignored. Token occurrences are then counted over the remaining texts.

    Args:
        d2: data accepted by `pd.DataFrame` that provides a "text" column.
        size: maximum number of words to keep (default 1000).

    Returns:
        A list of words ordered by decreasing frequency; empty when no text qualifies.
    """
    counts = Counter()
    for text in pd.DataFrame(d2)["text"]:
        tokens = process_text(text)
        if len(tokens) > 1:
            # Incremental counting: avoids concatenating every token list into one
            # giant list, and stays valid when no text passes the filter.
            counts.update(tokens)
    # most_common() sorts by decreasing count; slicing keeps the original `[:size]` semantics.
    return [word for word, _ in counts.most_common()[:size]]


    
def separe_dictionnary_by_sentences(d, min_length = 15):
    """Explode a column dictionary so that every row holds a single sentence.

    Args:
        d: mapping with parallel sequences under the keys "nom", "text", "x" and "y".
        min_length: sentences shorter than this many characters are skipped (default 15).

    Returns:
        A dict with the same keys as `d`. The "nom", "x" and "y" values of a row are
        repeated for each kept sentence of its text, and "text" holds that sentence.
        Any other key of `d` maps to an empty list.
    """
    d2 = {k: [] for k in d.keys()}
    for nom, text, x, y in tqdm(list(zip(d["nom"], d["text"], d["x"], d["y"]))):
        for sentence in separe_to_sentences(text, efficiency = True):
            # Filter on the length of the sentence itself; testing the number of
            # sentences here would keep or drop a whole text at once.
            if len(sentence) < min_length:
                continue
            d2["nom"].append(nom)
            d2["x"].append(x)
            d2["y"].append(y)
            d2["text"].append(sentence)
    return d2

In [None]:
# Load a 10-row sample and keep only the tokens that belong to the frequency lexicon.
d = get_df("SELECT nom, text, x, y from deputes join texts on texts.deputes_id = deputes.id LIMIT 10")
df = pd.DataFrame(d)
lexic = create_lexic(d, size = 1000)
# Set copy of the lexicon: constant-time membership test instead of scanning the list
# once per token.
lexic_set = set(lexic)
# NOTE: create_lexic has already run process_text on these texts; it is run again here
# because the cleaned tokens are not returned by create_lexic.
df["cleaned_text"] = df["text"].apply(process_text)
df["cleaned_text"] = df["cleaned_text"].apply(lambda x:[i for i in x if i in lexic_set])
# Overwrite the raw text with the space-joined, filtered lemmas.
df["text"] = df["cleaned_text"].apply(lambda x:" ".join(x))

## Without embedding

In [92]:
# Data for the TF-IDF experiment: a 100-row sample, lemmatised with process_text.
d = get_df("SELECT nom, text, x, y from deputes join texts on texts.deputes_id = deputes.id LIMIT 100")
df = pd.DataFrame(d)
df["cleaned_text"] = df["text"].apply(process_text)
# Optional step, currently disabled: restrict the tokens to the 1000 most frequent
# lemmas returned by create_lexic(d, size = 1000) before joining them back.
# Replace the raw text with the space-joined lemmas.
df["text"] = df["cleaned_text"].apply(" ".join)

In [93]:
# Learn the TF-IDF vocabulary and weights on the cleaned texts;
# X is the resulting document-term matrix.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["text"].tolist())

In [94]:
# Sanity check: vectorise a single document (the entry labelled 0) with the fitted vectorizer.
vectorizer.transform([df["text"][0]])

<1x2772 sparse matrix of type '<class 'numpy.float64'>'
	with 134 stored elements in Compressed Sparse Row format>

In [95]:
# Size of the learned vocabulary; TFIDF_NN uses the same value as its input width.
vectorizer.get_feature_names_out().shape

(2772,)

In [96]:
class TFIDF_NN(nn.Module):
    """Feed-forward regressor: vectorised documents -> one ReLU hidden layer -> 2 outputs.

    The vectorizer is stored on the module and applied inside `forward`, so the
    model is called directly with raw documents.
    """

    def __init__(self, vect, hidden_size = 100):
        """
        Args:
            vect: fitted vectorizer exposing `get_feature_names_out()` and `transform()`;
                the length of its feature list sets the input width.
            hidden_size: number of units in the hidden layer (default 100).
        """
        super().__init__()
        self.vectorizer = vect
        vocab_size = vect.get_feature_names_out().shape[0]
        self.linear1 = nn.Linear(vocab_size, hidden_size)
        self.output = nn.Linear(hidden_size, 2)

    def forward(self, text):
        """Vectorise `text` (an iterable of documents) and return one 2-value row per document."""
        dense = self.vectorizer.transform(text).toarray()
        # Build the input as float32 directly on the device holding the layer weights,
        # so the model keeps working after being moved with `.to(device)`.
        inputs = torch.as_tensor(dense, dtype=torch.float32, device=self.linear1.weight.device)
        h1 = F.relu(self.linear1(inputs))
        return self.output(h1)


In [100]:
# Full-batch training of the TF-IDF network on every row of `df`.
# NOTE(review): the loss is measured on the same rows the model is fitted on, so a
# value close to zero shows memorisation of this sample, not generalisation.
model = TFIDF_NN(vectorizer)

loss_fn = torch.nn.MSELoss(reduction='sum')
learning_rate = 1e-2
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# The targets never change between iterations: build the tensor once, outside the loop.
y_true = torch.tensor(np.array(df[["x", "y"]])).float()

for t in range(10000):
    # forward() runs the vectorizer on the raw texts at every call.
    y_pred = model(df["text"])
    loss = loss_fn(y_pred, y_true)
    # Report the summed squared error every 250 iterations.
    if t % 250 == 0:
        print(t, loss.item())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 25718920.0
250 792703.8125
500 400268.78125
750 345150.09375
1000 286165.375
1250 217393.375
1500 140575.875
1750 68729.140625
2000 22814.84765625
2250 5327.1279296875
2500 1045.66650390625
2750 190.31539916992188
3000 30.032869338989258
3250 3.7385342121124268
3500 0.3447841703891754
3750 0.022456083446741104
4000 0.0009877184638753533
4250 3.16251753247343e-05
4500 6.0757774917874485e-06
4750 3.5873563319910318e-06
5000 2.4432411009911448e-06
5250 1.7495185602456331e-06
5500 1.4415163605008274e-06
5750 1.2026939657516778e-06
6000 8.386195986531675e-07
6250 6.79257937008515e-07
6500 6.435911927837878e-07
6750 5.500369297806174e-07
7000 4.2026658775284886e-07
7250 3.0630690162070096e-07
7500 9.318500815425068e-07
7750 4.7309004003182054e-07
8000 2.7687416076660156
8250 2.6683846954256296e-07
8500 2.6197085389867425e-07
8750 3.2390380511060357e-07
9000 3.5832636058330536e-07
9250 0.0001452863507438451
9500 2.713495632633567e-07
9750 2.563392627052963e-07


array([[188.561735, 362.14572 ],
       [188.561735, 362.14572 ],
       [188.561735, 362.14572 ],
       [188.561735, 362.14572 ],
       [188.561735, 362.14572 ],
       [211.30841 ,  84.773225],
       [211.30841 ,  84.773225],
       [211.30841 ,  84.773225],
       [211.30841 ,  84.773225],
       [211.30841 ,  84.773225],
       [211.30841 ,  84.773225],
       [211.30841 ,  84.773225],
       [211.30841 ,  84.773225],
       [211.30841 ,  84.773225],
       [211.30841 ,  84.773225],
       [211.30841 ,  84.773225],
       [211.30841 ,  84.773225],
       [211.30841 ,  84.773225],
       [211.30841 ,  84.773225],
       [211.30841 ,  84.773225],
       [211.30841 ,  84.773225],
       [587.06696 , 403.45715 ],
       [587.06696 , 403.45715 ],
       [587.06696 , 403.45715 ],
       [587.06696 , 403.45715 ],
       [587.06696 , 403.45715 ],
       [587.06696 , 403.45715 ],
       [587.06696 , 403.45715 ],
       [587.06696 , 403.45715 ],
       [408.21438 , 120.81297 ],
       [40