#  Data visualisation

In [None]:
# rajouter le nombre des mts dans les features


In [1]:
! pip install tqdm
! pip install nltk
! pip install transformers
! pip install scikit-learn



In [2]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
from multiprocessing import Pool
from tqdm import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import numpy as np
import re
%matplotlib inline

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')


pd.set_option("display.max_rows", 400)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mouha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mouha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\mouha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


In [3]:

print(os.path.exists("training.csv"))

True


In [4]:
training_data_folder = "training"
annotations_file = "training_labels.json"
training_csv = "training.csv"

In [5]:



tags = set()
def get_tags_from_files (): #Run only once
    txt_files = [f for f in os.listdir(training_data_folder) if f.endswith('.txt')]

    for txt_file in txt_files:
        with open(os.path.join(training_data_folder,txt_file), 'r') as file:
            txt = file.read()
        for line in txt.split("\n"):
            if line:
                tag = line.split(" ")[1]
                if not tag in tags:
                    tags.add (tag)
#get_tags_from_files()
tags = ['Acknowledgement',
 'Alternation',
 'Background',
 'Clarification_question',
 'Comment',
 'Conditional',
 'Continuation',
 'Contrast',
 'Correction',
 'Elaboration',
 'Explanation',
 'Narration',
 'Parallel',
 'Q-Elab',
 'Question-answer_pair',
 'Result']


In [6]:
#TFIDF computing

def filter_special_characters (text):
    regex = r'[^a-zA-Z0-9\s.]'
    text = re.sub(regex,'',text)
    return text

def keep_only_noun_and_verbs (text):
    pos_tag = nltk.pos_tag(text.split())
    pos_tagged_noun_verb = []
    for word,tag in pos_tag:
        if tag == "NN" or tag == "NNP" or tag == "NNS" or tag == "VB" or tag == "VBD" or tag == "VBG" or tag == "VBN" or tag == "VBP" or tag == "VBZ":
            pos_tagged_noun_verb.append(word)
    return " ".join(pos_tagged_noun_verb)

def tokenize_and_filter_stopwords(text):
    # Tokenize the text
    text = filter_special_characters (text)
    text = keep_only_noun_and_verbs(text)
    words = word_tokenize(text)
    words = [word.lower() for word in words]
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]

    return filtered_words

def frequency (token, tokens):
    return len([t for t in tokens if t==token])/len(tokens)

def inverse_document_frequency (token, tokenized_sentences):
    d = len(tokenized_sentences)
    presence = len([sentence for sentence in tokenized_sentences if token in sentence[0]])
    return d/presence

def tfidf (tokenized_sentence, tokenized_sentences):
    #tokenized_sentence = tokenize_and_filter_stopwords(sentence)
    #tokenized_sentences = [tokenize_and_filter_stopwords(sentence) for sentence in sentences]
    words = set (tokenized_sentence)
    words_scores = {}
    for word in words:
        tfidf_ = frequency(word,words)*(np.log(1+inverse_document_frequency(word,tokenized_sentences)))
        words_scores[word] = tfidf_
    return words_scores


def sentencize (text):

    sentences = sent_tokenize(text)
    tokenized_sentences = [tokenize_and_filter_stopwords(sentence) for sentence in sentences]
    indexes = range(0,len(sentences))
    return list(zip(sentences,indexes)), list(zip(tokenized_sentences,indexes))

def sentences_scores (sentences ):
    sentences_scores = {}
    for sentence,index in sentences:
        words_scores = tfidf(sentence,sentences)
        score = sum([words_scores[word] for word in sentence])
        sentences_scores[" ".join(sentence)] = (score, index)
    return dict((sentences_scores.items()))


def extractive_summary (sentences, threshold):

    tokenized_sentences = list(zip([tokenize_and_filter_stopwords(sentence) for sentence in sentences],
                                   range(len(sentences))))

    sentences_scores_ = list(sentences_scores (tokenized_sentences).items())

    sentences_indexes = [sentence_and_score[1][1] for sentence_and_score in  sentences_scores_ if sentence_and_score[1][0]>=threshold]
    sentences_indexes.sort()
    summary = [sentences[index] for index in sentences_indexes]

    return summary
    #print(summary)


def tfidf_sentence_scores (sentences):
    tokenized_sentences = list(zip([tokenize_and_filter_stopwords(sentence) for sentence in sentences],range(len(sentences))))
    sentences_scores_ = sentences_scores (tokenized_sentences)
    return sentences_scores_



In [7]:
#Loading training data from json and txt files. 

def load_training_data ():

    with open(annotations_file, 'r') as file:
        annotations = json.load(file)

    json_files = [f for f in os.listdir(training_data_folder) if f.endswith('.json')]
    json_files.sort()
    dfs = []
    for json_file in json_files:
        file_path = os.path.join(training_data_folder, json_file)
        with open(file_path, 'r') as file:
            data = pd.json_normalize(json.load(file))
        shortname = json_file.split(".")[0]
        data["file"] = shortname
        relevance = annotations[shortname]
        data["relevance"] = relevance
        dfs.append (data)

    df = pd.concat (dfs, ignore_index=True)
    txt_files = [f for f in os.listdir(training_data_folder) if f.endswith('.txt')]
    for tag in tags:
        df[tag] = 0
    txt_files = [f for f in os.listdir(training_data_folder) if f.endswith('.txt')]
    txt_files.sort()
    print("extraction des données du graphe (cela va prendre un certain temps)")
    for i,txt_file in tqdm(enumerate(txt_files)):
        shortname = txt_file.split(".")[0]
        with open(os.path.join(training_data_folder,txt_file), 'r') as file:
            txt = file.read()
        for line in txt.split("\n"):

            if line:
                items = line.split(" ")
                tag = items[1]
                referenced = items[2]
                df.loc[(df['index'] == int(referenced)) & (df['file'] == shortname), tag] = 1
    return df

def get_files ():
    return list(set(df["file"].values.tolist()))

def add_tfidf_scores ():
    files = sorted(get_files())
    for file in tqdm(files):
        sentences = df[df["file"]==file]["text"].values.tolist()
        sentences_and_scores = tfidf_sentence_scores (sentences)
        scores = [0] * len(sentences)
        for score,index in sentences_and_scores.values():
            scores[index] = score

        df.loc[df["file"] == file, "score"] = scores
        
        
def split_dataset (dataset, split,testing_size):
  training_and_val_size = dataset.shape[0] - testing_size
  df = dataset[0:training_and_val_size]
  train_df, val_df = train_test_split(df, test_size=split, random_state=42)
  test_df = shuffled_df[training_and_val_size:]
  return train_df, val_df, test_df

In [8]:
#You only need to run load_training_data once
if os.path.exists(training_csv):
    df = pd.read_csv (training_csv)
else:
    print(f"Génération de {training_csv}")
    df = load_training_data ()
    add_tfidf_scores()
    df.to_csv(training_csv,index=False)

original_df = df
shuffled_df = original_df.sample(frac=1,random_state=42)

In [24]:
df.head()

Unnamed: 0.1,Unnamed: 0,speaker,text,index,file,relevance,Acknowledgement,Alternation,Background,Clarification_question,...,Contrast,Correction,Elaboration,Explanation,Narration,Parallel,Q-Elab,Question-answer_pair,Result,score
0,0,PM,Okay,0,ES2002a,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
1,1,PM,Right,1,ES2002a,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
2,2,PM,<vocalsound> Um well this is the kick-off meet...,2,ES2002a,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3.954745
3,3,PM,Um <vocalsound> and um,3,ES2002a,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,3.531381
4,4,PM,this is just what we're gonna be doing over th...,4,ES2002a,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,4.949238


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, f1_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

import numpy as np



In [11]:

testing_size = 10000
split = 0.16 #iftraining bert, else 0.8 for just tfidf and classifier because 0.16 takes to much time to execute
split_text = "016"

train_df, val_df, test_df = split_dataset (shuffled_df, split,testing_size)

print("training : ", train_df.shape[0])
print("validation : ", val_df.shape[0])
print("training + validation : ", train_df.shape[0]+val_df.shape[0])
print("test : ", test_df.shape[0])


training :  12524
validation :  50099
training + validation :  62623
test :  10000


In [15]:

def compute_sample_weights(y, class_weights):
    sample_weights = [class_weights[label] for label in y]
    return sample_weights



train_df = train_df [0:10000]

additional_features = tags + ["score"]
max_embedding = 300

class_weights = [1,3]
df = shuffled_df
x_train_sentences = train_df ["text"].values.tolist()
y_train = train_df["relevance"].values.tolist()

vectorizer = TfidfVectorizer(max_features=max_embedding)
x_train_vectorized = vectorizer.fit_transform(x_train_sentences)

print("fit and transform done")

x_train_combined = np.concatenate([x_train_vectorized.toarray(), train_df[additional_features]], axis=1)
print("test transform done")
print("training samples : ", len(y_train))
print("embedding dimension : ", x_train_combined.shape[1])


classifier = SVC(kernel='linear',class_weight={0:class_weights[0],1:class_weights[1]})
classifier.fit(x_train_combined, y_train)

print("classifier trained")

x_val_sentences = val_df ["text"].values.tolist()
y_val = val_df["relevance"].values.tolist()

x_val_vectorized = vectorizer.transform(x_val_sentences)

x_val_combined = np.concatenate([x_val_vectorized.toarray(), val_df[additional_features]], axis=1)




predictions = classifier.predict(x_val_combined)

print("Classification Report:")
print(classification_report(y_val, predictions))
#size = size*2

print("ADA")

sample_weights = compute_sample_weights(y_train, class_weights)
ada_classifier = AdaBoostClassifier(n_estimators=50, random_state=42)
ada_classifier.fit(x_train_combine, y_val, sample_weight=sample_weights)

ada_predictions = ada_classifier.predict(x_val_combined)

print("Classification Report:")
print(classification_report(y_val, ada_predictions))


fit and transform done
test transform done
training samples :  10000
embedding dimension :  317
classifier trained
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.78      0.85     40941
           1       0.43      0.74      0.54      9158

    accuracy                           0.77     50099
   macro avg       0.68      0.76      0.69     50099
weighted avg       0.84      0.77      0.79     50099



NameError: name 'mean_svc' is not defined