In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords

df = pd.read_csv('profiles.csv')

#print(df.describe())
#print(df.info())
#df.head()

#EDA

#age column
#plt.hist(df['age'])
#plt.show() #based on the histogram, let's consider that everything above 80 years in wrong and drop those rows
df = df[df['age'] < 80]

#body type
# 3 groups with ~20% of the answers each. All the others have less than 10%. Will keep those 3 and aggregate the others

map_body_type = {
    'average': 'average',
    'fit': 'fit', 
    'athletic': 'athletic',  
    'thin': 'other', 
    'curvy': 'other',
    'a little extra': 'other',  
    'skinny': 'other', 
    'full figured': 'other',  
    'overweight': 'other',  
    'jacked': 'other', 
    'used up': 'unknown',  
    'rather not say': 'unknown'
}

df['body_type'] = df['body_type'].map(map_body_type).fillna('unknown')

#diet
map_diet = {
    # Group 1: Anything
    'mostly anything': 'anything',
    'anything': 'anything',
    'strictly anything': 'anything',
    
    # Group 2: Vegetarian/Vegan
    'vegetarian': 'vegetarian',
    'mostly vegetarian': 'vegetarian',
    'strictly vegetarian': 'vegetarian',
    'vegan': 'vegetarian',
    'mostly vegan': 'vegetarian',
    'strictly vegan': 'vegetarian',
    
    # Group 3: Other (includes religious-based + remaining 'other')
    'other': 'other',
    'mostly other': 'other',
    'strictly other': 'other',
    'kosher': 'other',
    'mostly kosher': 'other',
    'strictly kosher': 'other',
    'halal': 'other',
    'mostly halal': 'other',
    'strictly halal': 'other'
}

df['diet'] = df['diet'].map(map_diet).fillna('unknown')
# ~40% of unknowns, ~50% of anything. Might consider dropping this one

#drinks

map_drinks = {
    'socially': 'socially',
    'rarely': 'light',
    'not at all': 'light',
    'often': 'heavy',
    'very often': 'heavy',
    'desperately': 'heavy'
}
df['drinks'] = df['drinks'].map(map_drinks).fillna('unknown')

#socially category dominates with 70%

#drugs
map_drugs = {
    'never': 'no',
    'sometimes': 'yes',
    'often': 'yes',
    'unknown': 'unknown'
}

df['drugs'] = df['drugs'].map(map_drugs).fillna('unknown')

#education

#will divide into education level and is student

education_level_conditions = [
    df['education'].str.contains('college', case = False, na=False),
    df['education'].str.contains('masters', case = False, na=False),
    df['education'].str.contains('high school|ph.d|law school|med school', case = False, na=False) #these represent a low percentage <5%, so will group them
]

education_level_choices = ['college', 'masters', 'other']

df['education_level'] = np.select(education_level_conditions, education_level_choices, default = 'unknown')

#print(df['education_level'].value_counts(normalize = True))

is_student_conditions = [
    df['education'].str.contains('working on', case = False, na=False),
    df['education'].str.contains('graduated from|dropped out', case = False, na=False),
    df['education'].str.contains('space camp', case = False, na=False) #will consider all these unknown    
]

is_student_choices = ['yes', 'no', 'unknown']

df['is_student'] = np.select(is_student_conditions, is_student_choices, default = 'unknown')

df = df.drop('education', axis = 1)

ethnicity_conditions = [
    df['ethnicity'] == 'white',
    df['ethnicity'] == 'asian',
    df['ethnicity'].isna()
]

ethnicity_choices = ['white', 'asian', 'unknown']

df['ethnicity'] = np.select(ethnicity_conditions, ethnicity_choices, default = 'other')

#height
#plt.hist(df['height'])
#plt.show()
df = df[(df['height'] >= 50) & (df['height'] <= 90)] #this is a reasonable interval for height

#income
# 80% of unknowns, not a great column to predict other columns, but might be fun to remove the unknowns and predict this one

#plt.hist(df_income['income'])
#plt.show()

def classify_income(i):
    if i <= 0:
        return 'unknown'
    elif i <= 20000:
        return 'low'
    elif i <= 50000:
        return 'medium'
    else: 
        return 'high'

df['income'] = df['income'].apply(classify_income)

#print(df['income'].value_counts(normalize = True))

#job

job_map = {
    # STEM/Technical
    'science / tech / engineering': 'stem',
    'computer / hardware / software': 'stem',

    # Creative/Artistic
    'artistic / musical / writer': 'creative',
    'entertainment / media': 'creative',

    # Business/Professional
    'sales / marketing / biz dev': 'business',
    'executive / management': 'business',
    'banking / financial / real estate': 'business',
    'law / legal services': 'business',

    # Education/Health
    'education / academia': 'education_health',
    'medicine / health': 'education_health',

    # Manual Labor / Skilled Trades - Not enough percentage to create its own category
    'construction / craftsmanship': 'other',
    'transportation': 'other',
    'clerical / administrative': 'other',

    # Public/Government - Not enough percentage to create its own category
    'political / government': 'other',
    'military': 'other',

    # Other groups
    'student': 'student',
    'unemployed': 'other',
    'retired': 'other',
    'rather not say': 'unknown',
    'hospitality / travel': 'other',
    'other': 'other'
}

df['job'] = df['job'].map(job_map).fillna('unknown')

#last online

df['last_online'] = pd.to_datetime(df['last_online'], format='%Y-%m-%d-%H-%M')
most_recent_date = df['last_online'].max()
df['last_online'] = (most_recent_date - df['last_online']).dt.days

#binary distribution into active or not active. Could also make a semi active category (<=1, <=3, >3)
def categorize_last_online(v):
    if v <= 3:
        return 'active'
    else:
        return 'not active'

df['last_online'] = df['last_online'].apply(categorize_last_online)

#location

def get_city(location):
    city = location.split(',')[0]
    if city == 'san francisco': #50% of the states are san francisco, so will do this and 'other'
        return city
    else:
        return 'other'

df['city'] = df['location'].apply(get_city)

#offspring
df['offspring'] = df['offspring'].fillna('unknown')

#will drop the column because there are 60% of unknowns and spliting between has and wants kids made it worse.

df.drop('offspring', axis = 1)

#orientation

map_orientation = {
    'straight': 'straight',
    'gay': 'queer',
    'bisexual': 'queer'
}

df['orientation'] = df['orientation'].map(map_orientation).fillna('unknown')

#pets
df['pets'] = df['pets'].fillna('unknown')

def likes_dogs(pets):
    if isinstance(pets, str):
        pets = pets.lower().strip()
        if 'likes dogs' in pets or 'has dogs' in pets:
            return 'yes'
    return 'no'

def likes_cats(pets):
    if isinstance(pets, str):
        pets = pets.lower().strip()
        if 'likes cats' in pets or 'has cats' in pets:
            return 'yes'
    return 'no'
        

df['likes_dogs'] = df['pets'].apply(likes_dogs)
df['likes_cats'] = df['pets'].apply(likes_cats)

#religion
df['religion'] = df['religion'].fillna('unknown')

def get_religion(r):
    try:
        religion = r.split(' ')[0]
    except:
        print(r)
        return
    if religion == 'buddhism' or religion == 'hinduism' or religion == 'islam':
        return 'other'
    else:
        return religion

religion_importance_conditions = [
    df['religion'].str.contains('not too serious|laughing', case=False, na=False),
    df['religion'].str.contains('somewhat serious|very serious', case=False, na=False),
    df['religion'].str.contains('unknown', case=False, na=False)
]

religion_importance_options = ['no', 'yes', 'unknown']

df['religion_importance'] = np.select(religion_importance_conditions, religion_importance_options, default = 'unknown')
df['religion'] = df['religion'].apply(get_religion)   

#sex - good to go
#print(df['sex'].value_counts(normalize=True, dropna=False))

#sign
df['sign'] = df['sign'].fillna('unknown')
df['sign'] = df['sign'].str.replace('&rsquo;', '\'', regex=False)

def get_sign(sign):
    if isinstance(sign, str):
        return sign.split(' ')[0]
    else:
        return 'unknown'

sign_importance_conditions = [
    df['sign'].str.contains('fun', regex = False),
    df['sign'].str.contains('but it doesn\'t matter', regex = False),
    df['sign'].str.contains('and it matters a lot', regex = False)
]

sign_importance_options = ['fun', 'not important', 'important']

df['sign_importance'] = np.select(sign_importance_conditions, sign_importance_options, default = 'unknown')
df['sign'] = df['sign'].apply(get_sign)

#print(df['sign'].value_counts(normalize=True, dropna=False)) # 12 categories + unknown -> not sure if it will have predictive power. Might drop it
#print(df['sign_importance'].value_counts(normalize=True, dropna=False))  # only 1% considers important, might drop it

df.drop(['sign', 'sign_importance'], axis = 1)

#smokes
df['smokes'] = df['smokes'].fillna('unknown')

smokes_map = {
    'no': 'no',
    'unknown': 'unknown',
    'sometimes': 'yes',
    'when drinking': 'yes',
    'trying to quit': 'yes',
    'yes': 'yes'
}

df['smokes'] = df['smokes'].map(smokes_map)

#print(df['smokes'].value_counts(normalize=True, dropna=False))

#speaks - has a list of comma separated values with proficiency between brackets
# will get the top 5 languages and save the proficiency level for each + add a column for languages count
df['speaks'] = df['speaks'].fillna('unknown')

def get_languages_count(l):
    count_commas = l.count(',')
    return (count_commas + 1)

df['language_count'] = df['speaks'].apply(get_languages_count)
#print(df['language_count'].value_counts(normalize=True, dropna=False))

#TO DO -> get the 5 top languages and create columns with the level

language_counts = {}
def get_top_languages(lang):
    languages = lang.split(',')
    for language in languages:
        l = language.strip().split(' ')[0]
        if l in language_counts:
            language_counts[l] += 1
        else:
            language_counts[l] = 1

df['speaks'].apply(get_top_languages)
top_5_languages = sorted(language_counts, key=language_counts.get, reverse=True)[:5]
print(top_5_languages)

for lang in top_5_languages:
    df[f'{lang}_level'] = 'none'

for idx, row in df['speaks'].items():
    languages = row.split(',')
    for language in languages:
        record = language.strip().split(' ')
        l = record[0].strip().lower()
        if len(record) == 1:
            level = 'okay'
        elif len(record) >= 2:
            level = record[1].strip('()').lower()
        if l in top_5_languages:
            df.at[idx, f'{l}_level'] = level        

#status
df['status'] = df['status'].fillna('unknown')
#print(df['status'].value_counts(normalize=True, dropna=False))

status_map = {
    'single': 'available',
    'seeing someone': 'unavailable',
    'available': 'available',
    'married': 'unavailable',
    'unknown': 'unknown'
}

df['status'] = df['status'].map(status_map)

#print(df['status'].value_counts(normalize=True, dropna=False))

df = df.drop(['offspring', 'location', 'pets', 'speaks', 'is_student'], axis=1)
not_essay_cols = [col for col in df.columns if 'essay' not in col]
essay_cols = [col for col in df.columns if 'essay' in col]
for col in not_essay_cols:
    if df[col].dtype in ['object', 'category']:
        print(df[col].value_counts(normalize=True))


print(df[not_essay_cols].info())

# Ensure stopwords are downloaded
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def preprocess(text):
    try:
        # Ensure it's a string
        if not isinstance(text, str):
            return ""

        # Remove HTML tags
        text = re.sub(r'<.*?>', ' ', text)

        # Lowercase
        text = text.lower()

        # Remove punctuation
        text = re.sub(r'[^\w\s]', '', text)

        # Remove stop words
        words = text.split()
        filtered = [word for word in words if word not in stop_words]

        return ' '.join(filtered)

    except Exception as e:
        print(f"Error processing text: {text}\nError: {e}")
        return ""
    
    

df['essays'] = df[essay_cols].fillna('').agg(' '.join, axis=1)
df['essays'] = df['essays'].apply(preprocess)
print(df['essays'].head())

['english', 'spanish', 'french', 'chinese', 'german']
body_type
other       0.248802
average     0.244561
fit         0.212166
athletic    0.197205
unknown     0.097267
Name: proportion, dtype: float64
diet
anything      0.465260
unknown       0.406833
vegetarian    0.094945
other         0.032962
Name: proportion, dtype: float64
drinks
socially    0.697263
light       0.153957
heavy       0.099187
unknown     0.049593
Name: proportion, dtype: float64
drugs
no         0.629502
unknown    0.234809
yes        0.135689
Name: proportion, dtype: float64
ethnicity
white      0.547999
other      0.255030
asian      0.102359
unknown    0.094612
Name: proportion, dtype: float64
income
unknown    0.808256
high       0.091990
medium     0.050528
low        0.049226
Name: proportion, dtype: float64
job
other               0.209912
business            0.173744
stem                0.159484
unknown             0.143821
education_health    0.120043
creative            0.111594
student             0.08

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PedroMarques\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0    would love think kind intellectual either dumb...
1    chef means 1 workaholic 2 love cook regardless...
2    im ashamed much writing public text online dat...
3    work library go school reading things written ...
4    hey hows going currently vague profile know co...
Name: essays, dtype: object


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import numpy as np

def downsample(df, col):
    min_count = df[col].value_counts().min()
    balanced_df = pd.concat([
        df[df[col] == category].sample(n=min_count, random_state=1)
        for category in df[col].unique()
    ])
    return balanced_df

# ----------- Recreate a consistent train/test split ----------- #

print("Starting cell...")

# Create full dataset again for consistent split
full_df = df.copy()
col_to_predict = 'drugs'
full_df = full_df[full_df[col_to_predict] != 'unknown']
full_df = full_df[full_df[col_to_predict].notna()]

print("Starting downsample...")

full_df = downsample(full_df, col_to_predict)

# Label encode target
#le = LabelEncoder()
#full_df[col_to_predict] = le.fit_transform(full_df[col_to_predict])
#y_full = full_df[col_to_predict].values
y_full = full_df[col_to_predict].values

print("Creating cols...")
#create vars with categorical columns and numerical columns
cat_cols = full_df.select_dtypes(include=['object','category']).columns.tolist()
cat_cols = [col for col in cat_cols if col != col_to_predict and 'essay' not in col]
num_cols = full_df.select_dtypes(include=['float64', 'int64']).columns.tolist()
num_cols = [col for col in num_cols if col != col_to_predict]

print("Preparing tabular data...")
# Prepare tabular data
df_dummies = pd.get_dummies(full_df[cat_cols], drop_first=True)
X_tab = pd.concat([full_df[num_cols], df_dummies], axis=1)
X_tab = X_tab.reset_index(drop=True)

# Standard scale numeric columns
scaler = StandardScaler()
if len(num_cols) > 0: # check if there are numerical columns
    X_tab[num_cols] = scaler.fit_transform(X_tab[num_cols])

# Prepare text data
X_text = full_df['essays'].fillna("").reset_index(drop=True)

print("Starting split...")

# ----------- Consistent split using index ----------- #

idx_train, idx_test = train_test_split(np.arange(len(full_df)), test_size=0.2, stratify=y_full, random_state=42)

X_tab_train, X_tab_test = X_tab.iloc[idx_train], X_tab.iloc[idx_test]
X_text_train, X_text_test = X_text.iloc[idx_train], X_text.iloc[idx_test]
y_train, y_test = y_full[idx_train], y_full[idx_test]

# ----------- Train Stacking Model (Tabular) ----------- #

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

print("Starting stacking cell...")

base_models = [
    ('lr', LogisticRegression(max_iter=1000)),
    ('rf', RandomForestClassifier(n_estimators=100)),
    ('et', ExtraTreesClassifier(n_estimators=100)),
    ('dt', DecisionTreeClassifier()),
    ('knn', KNeighborsClassifier()),
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss')),
]

meta_tabular = LogisticRegression(max_iter=1000)
stack = StackingClassifier(estimators=base_models, final_estimator=meta_tabular, cv=5, n_jobs=-1, passthrough=True)

stack.fit(X_tab_train, y_train)
y_pred = stack.predict(X_tab_test)
stack_score = classification_report(y_test, y_pred)
print('Classification report for stacking:')
print(stack_score)


# ----------- Train Text Model (Naive Bayes) ----------- #
print("Starting text stacking cell...")

text_pipeline = Pipeline([
    ('vectorizer', CountVectorizer(stop_words='english', max_features=5000)),
    ('nb', MultinomialNB())
])

text_pipeline.fit(X_text_train, y_train)
y_pred = text_pipeline.predict(X_text_test)
text_pipeline_score = classification_report(y_test, y_pred)
print('Classification report for stacking:')
print(text_pipeline_score)

# ----------- Generate meta-features ----------- #
print("Starting meta features cell...")
# Probabilities from stack model
pred_tab_train = stack.predict_proba(X_tab_train)  # shape [n_samples, n_classes]
pred_tab_test = stack.predict_proba(X_tab_test)

# Probabilities from text model
pred_text_train = text_pipeline.predict_proba(X_text_train)
pred_text_test = text_pipeline.predict_proba(X_text_test)

# Stack horizontally
X_meta_train = np.hstack((pred_tab_train, pred_text_train))
X_meta_test = np.hstack((pred_tab_test, pred_text_test))

# ----------- Train Final Meta-Model ----------- #

final_meta_model = LogisticRegression(max_iter=1000)
final_meta_model.fit(X_meta_train, y_train)

# ----------- Evaluate ----------- #

final_preds = final_meta_model.predict(X_meta_test)
print('🔍 Classification Report for Stacking + Naive Bayes:')
print(classification_report(y_test, final_preds, target_names=sorted(set(y_test))))


Starting cell...
Starting downsample...
Creating cols...
Preparing tabular data...
Starting split...
Starting stacking cell...
Classification report for stacking:
              precision    recall  f1-score   support

          no       0.75      0.77      0.76      1626
         yes       0.76      0.74      0.75      1625

    accuracy                           0.76      3251
   macro avg       0.76      0.76      0.76      3251
weighted avg       0.76      0.76      0.76      3251

Starting text stacking cell...
Classification report for stacking:
              precision    recall  f1-score   support

          no       0.68      0.74      0.71      1626
         yes       0.72      0.66      0.69      1625

    accuracy                           0.70      3251
   macro avg       0.70      0.70      0.70      3251
weighted avg       0.70      0.70      0.70      3251

Starting meta features cell...
🔍 Classification Report for Stacking + Naive Bayes:
              precision    recall

In [None]:
"""
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, StackingClassifier
from sklearn.feature_selection import VarianceThreshold, RFE, SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier
import seaborn as sns
from xgboost import XGBClassifier
from collections import Counter
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
#from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings("ignore", category=UserWarning) #make sure warning do not appear for the grid search


def downsample(df, col):
    min_count = df[col].value_counts().min()
    balanced_df = pd.concat([
        df[df[col] == category].sample(n=min_count, random_state=1)
        for category in df[col].unique()
    ])
    return balanced_df

data = df[not_essay_cols].copy()
col_to_predict = 'job'
print(data[col_to_predict].value_counts())

#remove rows where the target variable is unknown
data = data[data[col_to_predict] != 'unknown']
data = data[data[col_to_predict].notna()]

data = downsample(data, col_to_predict)

#create vars with categorical columns and numerical columns
cat_cols = data.select_dtypes(include=['object','category']).columns.tolist()
cat_cols = [col for col in cat_cols if col != col_to_predict]
num_cols = data.select_dtypes(include=['float64', 'int64']).columns.tolist()
num_cols = [col for col in num_cols if col != col_to_predict]

df_dummies = pd.get_dummies(data[cat_cols], drop_first = True)

x = pd.concat([data[num_cols], df_dummies], axis = 1)
y = data[col_to_predict]

print(data[col_to_predict].value_counts()) #check if the classes are balanced

le = LabelEncoder()
y = le.fit_transform(y)
print(le.classes_)

# ------------ Chi2 test ------------ #

# Chi2 requires all values to be non-negative
X_chi = x.copy()
X_chi[X_chi < 0] = 0  # Only necessary if you have negative values

selector = SelectKBest(score_func=chi2, k=10)  # Keep top 10 features
X_chi_selected = selector.fit_transform(X_chi, y)
selected_features = X_chi.columns[selector.get_support()]
print(f"Selected features from Chi2: {list(selected_features)}")

x = x[selected_features]

cat_cols = x.select_dtypes(include=['object','category']).columns.tolist()
num_cols = x.select_dtypes(include=['float64', 'int64']).columns.tolist()


# ------------ End of Chi2 test ------------ #

# Split once for both
train_idx, test_idx = train_test_split(np.arange(len(data)), test_size=0.2, stratify=data[col_to_predict], random_state=42)

# Apply same split
x_tab_train = x.iloc[train_idx]
x_tab_test = x.iloc[test_idx]
x_text_train = x_essay[train_idx]
x_text_test = x_essay[test_idx]
y_train = y[train_idx]
y_test = y[test_idx]

# the purpose of this split is to have x tabular values (nums and cats) to use in LR, DT, etc.. and also the essays to use in Naive Bayes

scaler = StandardScaler()
if len(num_cols) > 0:
    x_train[num_cols] = scaler.fit_transform(x_train[num_cols])
    x_test[num_cols] = scaler.transform(x_test[num_cols]) #never use fit transform on test data, because it will learn from train data

print(pd.Series(y_train).value_counts(normalize=True))


# --- XGB Classifier --- #

model = XGBClassifier(class_weight='balanced')


param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

gs = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='f1_macro', verbose=1, n_jobs=-1)    

model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print('Classification report for XGB Classifier:')
print(classification_report(y_test, y_pred))
#print("Best Parameters:", gs.best_params_)

# --- Logistic Regression --- #
model = LogisticRegression(max_iter = 1000, class_weight='balanced')

param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C': [1,3,5,7],  # Inverse of regularization strength
    'solver': ['liblinear', 'saga'],  # Solvers that support l1 and elasticnet
    'l1_ratio': [0, 0.5, 1]  # Only used with elasticnet
}
gs = GridSearchCV(model, param_grid, cv=5, scoring='f1_weighted', verbose=1, n_jobs=-1)

model.fit(x_train, y_train)
y_pred_model = model.predict(x_test)
model_score = classification_report(y_test, y_pred_model)
print('Classification report for LR:')
print(model_score)
#print("Best Parameters:", gs.best_params_)

# --- K-Nearest Neighbors ---
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)
model_score = classification_report(y_test, y_pred)
print('Classification report for KNN:')
print(model_score)

# --- Decision tree --- #
dtree = DecisionTreeClassifier(max_depth=3, random_state=42, class_weight='balanced')
dtree.fit(x_train, y_train)
y_pred = dtree.predict(x_test)
model_score = classification_report(y_test, y_pred)
print('Classification report for decision tree:')
print(model_score)

# --- Stacking --- #

base_models = [
    ('lr', LogisticRegression(max_iter=1000)),
    ('rf', RandomForestClassifier(n_estimators=100)),
    ('et', ExtraTreesClassifier(n_estimators=100)),
    ('dt', DecisionTreeClassifier()),
    ('knn', KNeighborsClassifier()),
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss')),
]

meta_model = LogisticRegression()

stack = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5, n_jobs=-1, passthrough=True)

stack.fit(x_tab_train, y_train)
y_pred = stack.predict(x_tab_test)
model_score = classification_report(y_test, y_pred)
print('Classification report for stacking:')
print(model_score)

# --- Naive Bayes --- #
data = df.copy()

data = data[data[col_to_predict] != 'unknown']
data = data[data[col_to_predict].notna()]

data = downsample(data, col_to_predict)

y = data[col_to_predict]

vectorizer = CountVectorizer(stop_words='english', max_features=5000)
x_essay = vectorizer.fit_transform(data['essays'])

x_essay_train, x_essay_test, y_train, y_test = train_test_split(x_essay, y, test_size=0.2, random_state=42)

model = MultinomialNB()
model.fit(x_essay_train, y_train)
y_pred = model.predict(x_essay_test)
print('Classification report for Naive Bayes:')
print(classification_report(y_test, y_pred))

# --- Stack Stacking + Naive Bayes --- #

# Probabilities from stacked models on validation
pred_stack_auto_val = stack.predict_proba(x_test)[:, 1]

# Probabilities from text model
pred_text_val = model.predict_proba(x_essay_test)[:, 1]

# Combine both as new meta-features
X_meta_val = np.hstack((pred_stack_auto_val, pred_text_val))
print(X_meta_val)

meta_model = LogisticRegression()
meta_model.fit(X_meta_val, y_train)

pred_stack_auto_test = stack.predict_proba(x_test)
pred_text_test = model.predict_proba(x_essay_test)

X_meta_test = np.column_stack((pred_stack_auto_test, pred_text_test))
final_preds = meta_model.predict(X_meta_test)
print('Classification report for stacking + Naive Bayes:')
print(classification_report(y_test, final_preds))
"""
