In [106]:
import nltk
import re
from gensim.models import Word2Vec
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer , CountVectorizer
from sklearn.metrics import mean_squared_error , r2_score


## Data Processing

In [90]:
def remove_special_chars(text):
  # Regular expression to match all characters except alphanumeric (a-z, A-Z, 0-9) and underscore
    text = re.sub(r'\b\S*\.com\S*\b', '', text, flags=re.MULTILINE)
    # Remove links starts with http/https/www
    text = re.sub(r'http\S+\.com\S*|www\S+\.com\S*|https\S+\.com\S*', '', text, flags=re.MULTILINE)
    # Remove words starting with # or @
    text = re.sub(r'(@|#)\w+', '', text)
    # Remove non-alphanumeric characters (except spaces)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Convert to lowercase and remove extra spaces
    cleaned_text = ' '.join(text.lower().split())
    return cleaned_text
def preprocess_text(text):
  text = remove_special_chars(text)
  # Tokenization (sentence -> words)
  tokens = nltk.word_tokenize(text)

  # Lowercase

  # Stop word removal
  stopwords = nltk.corpus.stopwords.words('english')
  tokens = [token for token in tokens if token not in stopwords]

  # Lemmatization
  lemmatizer = nltk.stem.WordNetLemmatizer()
  tokens = [lemmatizer.lemmatize(token) for token in tokens]

  return tokens

In [91]:
dataset = pd.read_csv('answers.csv')
print(len(dataset))
dataset.head()

2442


Unnamed: 0,id,answer,score,correct
0,1.1,High risk problems are address in the prototyp...,3.5,0.0
1,1.1,To simulate portions of the desired final prod...,5.0,1.0
2,1.1,A prototype program simulates the behaviors of...,4.0,1.0
3,1.1,Defined in the Specification phase a prototype...,5.0,1.0
4,1.1,It is used to let the users have a first idea ...,3.0,0.0


In [92]:
dataset = dataset.drop_duplicates()
print(len(dataset))

2341


In [95]:
dataset['answer_tokenized'] = dataset['answer'].apply(preprocess_text)
dataset.head()

Unnamed: 0,id,answer,score,correct,answer_tokenized
0,1.1,High risk problems are address in the prototyp...,3.5,0.0,"[high, risk, problem, address, prototype, prog..."
1,1.1,To simulate portions of the desired final prod...,5.0,1.0,"[simulate, portion, desired, final, product, q..."
2,1.1,A prototype program simulates the behaviors of...,4.0,1.0,"[prototype, program, simulates, behavior, port..."
3,1.1,Defined in the Specification phase a prototype...,5.0,1.0,"[defined, specification, phase, prototype, sti..."
4,1.1,It is used to let the users have a first idea ...,3.0,0.0,"[used, let, user, first, idea, completed, prog..."


In [96]:
dataset = dataset[dataset['answer_tokenized'].apply(lambda x: len(x) >= 4)]
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2015 entries, 0 to 2441
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                2015 non-null   float64
 1   answer            2015 non-null   object 
 2   score             2015 non-null   float64
 3   correct           2015 non-null   float64
 4   answer_tokenized  2015 non-null   object 
dtypes: float64(3), object(2)
memory usage: 94.5+ KB


## TF-IDF

In [101]:
dataset['answer_processed'] = dataset['answer_tokenized'].apply(lambda x: ' '.join(x))

# Initialiser le TfidfVectorizer
vectorizer = TfidfVectorizer()

# Appliquer le TF-IDF sur les textes prétraités
tfidf_matrix = vectorizer.fit_transform(dataset['answer_processed']).toarray()
print((tfidf_matrix))

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


## Bag Of words

In [104]:
vectorizer1 = CountVectorizer()
bow_representation = vectorizer1.fit_transform(dataset['answer_processed']).toarray()
print(bow_representation)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


## Word Embedding

In [80]:

# Assurez-vous que votre colonne 'answers_tokenized' contient les tokens pour chaque réponse
sentences = dataset['answer_tokenized'].tolist()
word2vec_model = Word2Vec(sentences=sentences, vector_size=50, window=5, min_count=1, workers=4)
# 2. Convertir chaque séquence de tokens en vecteurs Word2Vec

def embed_answer(answer_tokens):
    embedded_vectors = []
    for token in answer_tokens:
        try:
            embedded_vectors.append(word2vec_model.wv[token])
        except KeyError:
            embedded_vectors.append(np.zeros(word2vec_model.vector_size))

    # Combine token vectors (e.g., average or sum)
    return np.mean(embedded_vectors, axis=0)  # Return a NumPy array

# Appliquer la fonction 'embed_answer' à chaque séquence de tokens
dataset['answer_embedded'] = dataset['answer_tokenized'].apply(embed_answer)
embedded_vectors = np.array(dataset['answer_embedded'].tolist())



In [81]:
dataset.head()

Unnamed: 0,id,answer,score,correct,answer_tokenized,answer_embedded
0,1.1,High risk problems are address in the prototyp...,3.5,0.0,"[high, risk, problem, address, prototype, prog...","[0.15536742, -0.014855018, 0.1431846, -0.08290..."
1,1.1,To simulate portions of the desired final prod...,5.0,1.0,"[simulate, portion, desired, final, product, q...","[0.12358978, -0.0084962575, 0.1115971, -0.0642..."
2,1.1,A prototype program simulates the behaviors of...,4.0,1.0,"[prototype, program, simulates, behavior, port...","[0.1328473, -0.011225945, 0.12466898, -0.06936..."
3,1.1,Defined in the Specification phase a prototype...,5.0,1.0,"[defined, specification, phase, prototype, sti...","[0.12641999, -0.009804486, 0.11310876, -0.0657..."
4,1.1,It is used to let the users have a first idea ...,3.0,0.0,"[used, let, user, first, idea, completed, prog...","[0.10322445, -0.012190971, 0.09594863, -0.0523..."


## Models "Regression"

In [82]:
from sklearn.model_selection import train_test_split
y = dataset['score']
X = embedded_vectors

# Diviser les données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [105]:
from sklearn.preprocessing import StandardScaler
y = dataset['score']
X = embedded_vectors  # Use the embedded vectors directly

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform both training and test data
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize and train the SVR model
svr_model = SVR()
svr_model.fit(X_train, y_train)

# Predict scores on the test set
y_pred = svr_model.predict(X_test)

# Evaluate model performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error SVR:", mse)
print("R-squared SVR:", r2)

Mean Squared Error SVR: 1.1344749611414648
R-squared SVR: -0.08477127805937679


In [107]:
from sklearn.linear_model import LinearRegression
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Predict scores on the test set
y_pred = linear_model.predict(X_test)

# Evaluate model performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error LinearRegression:", mse)
print("R-squared LinearRegression:", r2)

Mean Squared Error LinearRegression: 0.9568190000691131
R-squared LinearRegression: 0.08510120969779555


In [108]:
from sklearn.tree import DecisionTreeRegressor

# Initialiser et entraîner le modèle Decision Tree Regression
decision_tree_model = DecisionTreeRegressor()
decision_tree_model.fit(X_train, y_train)

# Prédire les scores sur l'ensemble de test
y_pred_decision_tree = decision_tree_model.predict(X_test)

# Évaluer les performances du modèle
mse_decision_tree = mean_squared_error(y_test, y_pred_decision_tree)
r2_decision_tree = r2_score(y_test, y_pred_decision_tree)

print("Mean Squared Error Decision Tree Regression:", mse_decision_tree)
print("R-squared Decision Tree Regression:", r2_decision_tree)


Mean Squared Error Decision Tree Regression: 1.774968982630273
R-squared Decision Tree Regression: -0.6972039381691546


## Interpretation

Comparons les résultats de régression entre différents modèles, notamment la régression linéaire, la régression par arbre de décision et la régression à vecteurs de support (SVR).

1. **Régression par arbre de décision**:
   - Erreur quadratique moyenne (Mean Squared Error): 1.775
   - Coefficient de détermination (R-squared): -0.697
   - L'erreur quadratique moyenne mesure l'écart moyen au carré entre les valeurs prédites et les valeurs réelles. Une valeur plus faible est souhaitable.
   - Le coefficient de détermination, également appelé R-squared, mesure la proportion de la variance dans la variable dépendante qui est prévisible à partir de la variable indépendante. Une valeur plus proche de 1 est préférable, indiquant un meilleur ajustement du modèle.

2. **Régression linéaire**:
   - Erreur quadratique moyenne: 0.957
   - Coefficient de détermination: 0.085
   - L'erreur quadratique moyenne est plus faible que celle de la régression par arbre de décision, indiquant un ajustement légèrement meilleur.
   - Le coefficient de détermination est également plus élevé, bien que toujours relativement faible, indiquant que le modèle linéaire explique mieux la variance dans les données par rapport à l'arbre de décision.

3. **SVR (Régression à vecteurs de support)**:
   - Erreur quadratique moyenne: 1.134
   - Coefficient de détermination: -0.085
   - L'erreur quadratique moyenne est intermédiaire entre la régression linéaire et l'arbre de décision, mais le coefficient de détermination est négatif, ce qui indique que le modèle ne s'ajuste pas bien aux données.

**Interprétation**:
- Dans ce cas, la régression linéaire semble être le meilleur choix, car elle présente une erreur quadratique moyenne plus faible et un coefficient de détermination légèrement plus élevé par rapport aux autres modèles.
- Cependant, il est important de noter que l'interprétation des résultats de régression doit être effectuée avec prudence, en tenant compte des spécificités du domaine et des données.