In [11]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

import numpy as np


In [12]:

# Load the DataFrame from the pickle file.
# Later cells read its 'embedding', 'link_flair_text' and 'selftext' columns.
# NOTE(review): unpickling can execute arbitrary code, so only load files that
# were produced locally or come from a trusted source.
submissions = pd.read_pickle('output/openai_embedded_large_all.pkl')

In [13]:

# Preprocess the labels: map every flair value to an integer class id.
#
# The untouched flair values are stashed in 'link_flair_text_raw' the first
# time this cell runs, and the encoder is always fitted on that copy. Without
# the stash, re-running the cell would fit the encoder on the already-encoded
# integers, and label_encoder.classes_ would no longer hold the flair names.
if 'link_flair_text_raw' not in submissions.columns:
    submissions['link_flair_text_raw'] = submissions['link_flair_text']

label_encoder = LabelEncoder()

# 'link_flair_text' keeps holding the integer ids, which is what the later
# cells read when they build y.
submissions['link_flair_text'] = label_encoder.fit_transform(submissions['link_flair_text_raw'])
num_classes = len(label_encoder.classes_)


In [14]:

# Convert the embeddings and labels to arrays.
# np.stack joins the per-row embeddings along a new first axis (one row per
# submission); it requires every embedding to have the same shape.
X = np.stack(submissions['embedding'].values)
y = submissions['link_flair_text'].values

# Split the data into training (80%) and testing (20%) sets.
# random_state pins the shuffle so that every run of the notebook produces the
# same split and the accuracies printed by the cells below stay comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)




In [15]:
# Three base classifiers, each wrapped in a single-step Pipeline and all
# trained on the same embedding features.
# random_state is fixed on the stochastic estimators (the MLP and the random
# forest) so that re-running this cell on the same split gives the same fitted
# models and therefore the same accuracies.
mlp = Pipeline([
    ('mlp', MLPClassifier(hidden_layer_sizes=(30, 20),
                          solver='adam',
                          max_iter=1000,
                          learning_rate='constant',
                          early_stopping=True,       # stop once the held-out score stops improving
                          validation_fraction=0.2,   # share of the training data held out for that check
                          random_state=42)),
])

rf = Pipeline([
    ('rf', RandomForestClassifier(n_estimators=150,
                                  max_depth=6,
                                  min_samples_leaf=30,
                                  criterion='gini',
                                  random_state=42)),
])

lr = Pipeline([
    ('lr', LogisticRegression(max_iter=1000)),
])

# Define the ensemble model.
# voting='hard' means the predicted class is the majority vote of the three
# classifiers' predicted labels.
ensemble = VotingClassifier(estimators=[
    ('mlp', mlp),
    ('rf', rf),
    ('lr', lr),
], voting='hard')

ensemble.fit(X_train, y_train)

# Make predictions on both splits so over-fitting shows up as a train/test gap
y_pred_train = ensemble.predict(X_train)
y_pred_test = ensemble.predict(X_test)

# Evaluate the model
accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)

print(f'Accuracy of the model on the train set: {accuracy_train * 100:.2f}%')
print(f'Accuracy of the model on the test set: {accuracy_test * 100:.2f}%')

Accuracy of the model on the train set: 75.78%
Accuracy of the model on the test set: 72.16%


In [16]:
# k-nearest-neighbours baseline on the embedding features: every prediction is
# a vote among the 100 closest training points. fit() returns the estimator
# itself, so construction and training are chained.
knn_model = KNeighborsClassifier(n_neighbors=100).fit(X_train, y_train)

# Score the held-out split only
y_pred = knn_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy of the model on the test set: {accuracy * 100:.2f}%')


Accuracy of the model on the test set: 64.82%


In [17]:


# Gradient-boosted shallow trees (depth 2) on the embedding features.
#
# scikit-learn accepts loss='exponential' only for binary problems; with more
# than two classes fit() raises ValueError. The exponential loss is therefore
# requested only when y_train really has two classes, and the estimator's
# default loss is used otherwise.
# random_state is fixed so repeated runs give the same model.
gb_params = dict(n_estimators=100, max_depth=2, random_state=42)
if len(np.unique(y_train)) == 2:
    gb_params['loss'] = 'exponential'

model = Pipeline([
    ('gb', GradientBoostingClassifier(**gb_params))
])

model.fit(X_train, y_train)

# Make predictions on both splits so over-fitting shows up as a train/test gap
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Evaluate the model
accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)

print(f'Accuracy of the model on the train set: {accuracy_train * 100:.2f}%')
print(f'Accuracy of the model on the test set: {accuracy_test * 100:.2f}%')

In [None]:
# Stand-alone random forest with the same hyper-parameters as the forest used
# inside the voting ensemble.
# This cell needs X_train / X_test / y_train / y_test from the split cell, so
# that cell has to be run first in a fresh kernel.
# random_state fixes the bootstrap samples and feature sub-sampling so repeated
# runs give the same forest.
model = Pipeline([
    ('rf', RandomForestClassifier(n_estimators=150,
                                  max_depth=6,
                                  min_samples_leaf=30,
                                  criterion='gini',
                                  random_state=42))
])

# Train the model
model.fit(X_train, y_train)

# Make predictions on both splits so over-fitting shows up as a train/test gap
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Evaluate the model
accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)

print(f'Accuracy of the model on the train set: {accuracy_train * 100:.2f}%')
print(f'Accuracy of the model on the test set: {accuracy_test * 100:.2f}%')



NameError: name 'X_train' is not defined

In [None]:
# Baseline on the raw post text instead of the embeddings.
# Missing selftext values are replaced with an empty string because
# TfidfVectorizer raises on NaN documents.
X = submissions['selftext'].fillna('').values  # NOT the embeddings this time
y = submissions['link_flair_text'].values

# Split the data into training and testing sets.
# random_state pins the shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the pipeline: TF-IDF features feeding a small neural network
model = Pipeline([
    ('tfidf', TfidfVectorizer()),  # gets features out of text (like our embeddings)
    ('mlp', MLPClassifier(hidden_layer_sizes=(30, 20),
                          activation='logistic',
                          solver='adam',
                          max_iter=1000,
                          learning_rate='constant',
                          early_stopping=True,       # stop once the held-out score stops improving
                          validation_fraction=0.2,   # share of the training data held out for that check
                          random_state=42)),         # reproducible weight init and hold-out selection
])

# Train the model
model.fit(X_train, y_train)

# Make predictions on both splits so over-fitting shows up as a train/test gap
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Evaluate the model
accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)

print(f'Accuracy of the model on the train set: {accuracy_train * 100:.2f}%')
print(f'Accuracy of the model on the test set: {accuracy_test * 100:.2f}%')


# Re-fill X and y with the embeddings so that any cell run after this one
# works on embedding features again rather than on raw text.
X = np.stack(submissions['embedding'].values)
y = submissions['link_flair_text'].values

# Same seed as above, so the embedding split is reproducible as well
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


