In [None]:
import pandas as pd
import numpy as np
import sklearn as sk
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the labelled training split from the current working directory.
path = os.getcwd()
train_csv = path+"/train.csv"
raw_data = pd.read_csv(train_csv)

# Only the last expression of a cell is rendered, so this tally is evaluated but not shown.
raw_data["Vinyl"].value_counts(dropna=False)
raw_data.head()

In [None]:
# Read the unlabelled test split from the current working directory.
path = os.getcwd()
test_csv = path+"/test.csv"
raw_data_test = pd.read_csv(test_csv)

# Only the last expression of a cell is rendered, so this tally is evaluated but not shown.
raw_data_test["Vinyl"].value_counts(dropna=False)
raw_data_test.head()

In [None]:
# Give each feature its own axes: when everything is drawn on the implicit
# current axes, the log x-scale meant for Food also rescales the AvgTime histograms.
fig, (ax_time, ax_food) = plt.subplots(1, 2, figsize=(12, 4))

# Train vs. test distribution of AvgTime, overlaid on purpose for comparison.
sns.histplot(raw_data, x="AvgTime", ax=ax_time)
sns.histplot(raw_data_test, x="AvgTime", color="orange", ax=ax_time)
ax_time.set_title("AvgTime: train (default colour) vs test (orange)")

# Food spending in the training split, shown on a log x-axis.
f=sns.histplot(raw_data, x="Food", ax=ax_food)
f.set_xscale('log')
ax_food.set_title("Food (train, log x-axis)")

fig.tight_layout()

In [None]:
# One axes per feature: previously Age, Drinks and the album count plot were all
# drawn on the same implicit axes, and the log-log scaling was applied to all of them.
fig, (ax_age, ax_drinks, ax_album) = plt.subplots(1, 3, figsize=(18, 4))

# Age: train vs. test, overlaid for comparison.
a=sns.histplot(raw_data, x="Age", ax=ax_age)
b=sns.histplot(raw_data_test, x="Age", color='orange', ax=ax_age)
ax_age.set_title("Age: train (default colour) vs test (orange)")

# Drinks: train vs. test on log-log axes.
d=sns.histplot(raw_data, x="Drinks", ax=ax_drinks)
e=sns.histplot(raw_data_test, x='Drinks', color='orange', ax=ax_drinks)
d.set_xscale('log')
d.set_yscale('log')
ax_drinks.set_title("Drinks: train vs test (log-log)")

# Count of each preferred album, split by the FreePass label.
album_pass = sns.countplot(data=raw_data,x="PreferedAlbum",hue="FreePass", ax=ax_album)
ax_album.set_title("PreferedAlbum by FreePass")

fig.tight_layout()

In [None]:
# Exploration frame: selected columns with missing values replaced by 0.0,
# plus Age imputed with the column mean.
raw_data2 = raw_data[['Food','Drinks',"FreePass","AvgTime","Vinyl"]].fillna(value=0.0)
col = raw_data['Age'].fillna(value=raw_data['Age'].mean())
raw_data2['Age']=col

# log1p instead of log: Food/Drinks were just zero-filled, and log(0) is -inf,
# which would poison the correlation matrix and the plots that use these columns.
# This also matches the log1p features built for the models further down.
raw_data2['logDrinks']=np.log1p(raw_data2['Drinks'])
raw_data2['logFood']=np.log1p(raw_data2['Food'])

# Separate axes per plot so the log x-scale of one does not distort the others.
fig, axes = plt.subplots(2, 2, figsize=(14, 9))
ax_food, ax_vinyl, ax_drinks, ax_age = axes.ravel()

food_pass = sns.histplot(data=raw_data2, x="Food", hue="FreePass", kde=True, element='step', stat='count', ax=ax_food)
food_pass.set_xscale('log')

sns.histplot(raw_data2, x="Vinyl", hue="FreePass", ax=ax_vinyl)

drink_pass = sns.histplot(data=raw_data2, x="Drinks", hue="FreePass", kde=True, element='step', stat='count', ax=ax_drinks)
drink_pass.set_xscale('log')

# Uses the un-imputed Age column from raw_data, as the original cell did.
age_pass = sns.histplot(data=raw_data,x="Age", hue="FreePass",kde=True, element='step',stat='count', ax=ax_age)

fig.tight_layout()

In [None]:
# Pairwise correlations of the exploration frame, drawn as a heatmap and
# then displayed as a table (last expression of the cell).
corr_matrix = raw_data2.corr()
sns.heatmap(
    corr_matrix,
    vmin=-1,
    vmax=1,
    cmap='viridis',
)
corr_matrix

In [None]:
# 2-D histogram of Age against logDrinks, coloured by FreePass.
jntplt2 = sns.histplot(
    raw_data2,
    x='Age',
    y='logDrinks',
    hue='FreePass',
    multiple='layer',
)
jntplt2.set_xlim(0,100)

# Joint grid of preferred album (cast to a categorical) against Food spending.
album_as_category = raw_data['PreferedAlbum'].astype('category')
jnt_fooddrink = sns.JointGrid(data=raw_data, x=album_as_category, y='Food', hue='FreePass')
jnt_fooddrink.plot(sns.histplot, sns.histplot)

In [None]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
import textblob as tb

nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('vader_lexicon')

def preprocess_text(text):
    """Normalise free text for sentiment scoring.

    Lower-cases the text, strips every character that is neither a word
    character nor whitespace, tokenises it with ``word_tokenize`` and drops
    English stop words.

    Parameters
    ----------
    text : str or missing value
        Raw text. Missing values (as detected by ``pd.isna``) yield ``""``.
        Other non-string scalars are converted with ``str`` first.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    if pd.isna(text):
        return ""

    # Load the stop-word list once and keep it on the function object; the
    # original rebuilt the full list for every single token.
    stop_words = getattr(preprocess_text, "_stop_words", None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        preprocess_text._stop_words = stop_words

    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = [w for w in word_tokenize(text) if w not in stop_words]

    return ' '.join(tokens)

------------------------------------------
## ML Models and Predicting FreePass ##

Linear Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, roc_auc_score, log_loss
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier


In [None]:
def split_ticket(ticket):
    """Split a slash-delimited ticket string into its three parts.

    Example: ``'CB/734/XL'`` -> ``('CB', 734, 'XL')``.

    Parameters
    ----------
    ticket : object
        Ticket descriptor; it is converted with ``str`` before splitting.

    Returns
    -------
    tuple
        ``(first part, second part as int, last part)``. When the value has no
        second part or that part is not an integer, ``(None, 0, None)``.
    """
    parts = str(ticket).split('/')
    try:
        return parts[0], int(parts[1]), parts[-1]
    except (IndexError, ValueError):
        # IndexError: no '/' in the value (e.g. 'nan'); ValueError: non-numeric middle part.
        return None, 0, None

# Shared analyzer instance used by vader_polarity below.
sia = SentimentIntensityAnalyzer()


def vader_polarity(text):
    """Return the VADER compound score of ``text``.

    Anything that is not a string, and any string that is empty after
    stripping whitespace, scores 0.0.
    """
    has_content = isinstance(text, str) and text.strip() != ""
    if has_content:
        return sia.polarity_scores(text)["compound"]
    return 0.0


def preprocess_csv(path, save=False, filename=""):
    """Load a raw CSV and turn it into an encoded, model-ready frame.

    Steps, in order:
      1. fill missing ``Age`` with the column median;
      2. fill missing ``Concert``, ``TicketInfo``, ``PreferedAlbum``, ``Vinyl``
         and ``VIP`` with the column mode;
      3. score ``Opinion`` with ``preprocess_text`` + ``vader_polarity``;
      4. parse ``Concert`` into city, start time, end time and duration;
      5. split ``TicketInfo`` into type / number / class via ``split_ticket``;
      6. one-hot encode the remaining categorical columns;
      7. drop ``Opinion`` and ``TicketInfo``.

    All imputation statistics are computed from the file being processed.

    Parameters
    ----------
    path : str
        CSV file to read.
    save : bool, default False
        When True, write the result to ``filename``.
    filename : str, default ""
        Output CSV path; only used when ``save`` is True.

    Returns
    -------
    pandas.DataFrame
        The processed frame.

    Raises
    ------
    ValueError
        If ``save`` is True but ``filename`` is empty.
    """
    # Fail before doing any work rather than at the final to_csv call.
    if save and not filename:
        raise ValueError("preprocess_csv: 'filename' must be provided when save=True")

    data = pd.read_csv(path)

    # Fill NaNs #
    # Numerical column (Age) with the median #
    print(f"Missing values: {data.isnull().sum().sum()}")
    print(f"Missing values in Age: {data['Age'].isnull().sum()}")
    data['Age'] = data['Age'].fillna(data['Age'].median())

    # Categorical columns with the mode #
    cat_cols = ['Concert', 'TicketInfo', 'PreferedAlbum', 'Vinyl', 'VIP']
    for col in cat_cols:
        print(f"Missing values in {col}: {data[col].isnull().sum()}")
    for col in cat_cols:
        # mode() returns a Series; it is empty when the whole column is missing,
        # in which case there is nothing sensible to fill with.
        modes = data[col].mode()
        if not modes.empty:
            data[col] = data[col].fillna(modes.iloc[0])
    print(f"Still missing values: {data.isnull().sum().sum()}")

    data["Opinion_Polarity"] = (data["Opinion"].apply(preprocess_text).apply(vader_polarity))

    concert = data['Concert'].astype(str)
    # Leading run of letters.
    data['Concert_City'] = concert.str.extract(r'^([A-Za-z]+)', expand=False)
    # Digits sitting between a run of letters and the dash (second capture group).
    data['Concert_StartTime'] = concert.str.extract(r'([A-Za-z]+)(\d+)-', expand=False)[1].astype(float)
    # Digits after the dash and before 'pm'.
    data['Concert_EndTime'] = concert.str.extract(r'-(\d+)pm', expand=False).astype(float)
    # NOTE(review): the +12 presumably converts a pm end time to a 24h clock while
    # treating the start as an am time -- confirm against the Concert format.
    data['Concert_Duration'] = 12+data['Concert_EndTime'] - data['Concert_StartTime']
    data = data.drop(columns=['Concert'])

    # Split TicketInfo into three new columns. Building the frame from the list
    # of tuples avoids constructing one pd.Series per row.
    ticket_parts = pd.DataFrame(
        data['TicketInfo'].map(split_ticket).tolist(),
        index=data.index,
        columns=['Ticket_Type', 'Ticket_Num', 'Ticket_Class'],
    )
    data[['Ticket_Type', 'Ticket_Num', 'Ticket_Class']] = ticket_parts

    # Create dummy variables (One-Hot Encoding).
    # drop_first=True drops the first level of each column to avoid multicollinearity.
    still_not_number_cols = ['PreferedAlbum', 'Vinyl', 'VIP', 'Concert_City', 'Ticket_Type', 'Ticket_Class']
    df_encoded = pd.get_dummies(data, columns=still_not_number_cols, drop_first=True)
    df_final = df_encoded.drop(columns=['Opinion', 'TicketInfo']) # drop processed columns

    if save:
        df_final.to_csv(filename, index=False)

    return df_final

In [None]:
# Build the processed training frame; save=False, so nothing is written even though a filename is passed.
data = preprocess_csv(
    path+'/train.csv',
    save=False,
    filename=path+'/train_cleaned.csv',
)
data.columns

In [None]:
# Build the processed test frame and write it next to the raw file.
data_test = preprocess_csv(
    path+"/test.csv",
    save=True,
    filename=path+"/test_cleaned.csv",
)
data_test.head()

In [None]:
# Correlation heatmap of the processed training frame.
corr = data.corr()
corr_graph=sns.heatmap(corr, vmin=-1,vmax=1,cmap='viridis')

# Build a proper DataFrame for the opinion analysis. The original bound
# `opinion_data` to a single Series, so every `opinion_data[...] = ...` below
# inserted one element labelled e.g. "cleanOpinion" instead of adding a column,
# and the later histplot(data=opinion_data, x='sentiment', ...) had no such column.
opinion_data = raw_data[['Opinion']].copy()
opinion_data["cleanOpinion"] = opinion_data['Opinion'].apply(preprocess_text)

# Reuse the shared VADER helper instead of creating a second analyzer instance.
opinion_data['sentiment'] = opinion_data['cleanOpinion'].apply(vader_polarity)
opinion_data['FreePass']=raw_data["FreePass"].astype(int)
opinion_data['Age']=raw_data["Age"].astype(float)


In [None]:
# Histogram of opinion sentiment against Age, coloured by FreePass.
extra = sns.histplot(
    data=opinion_data,
    x='sentiment',
    y="Age",
    hue='FreePass',
    kde=True,
    element='step',
    stat='count',
    bins=33,
)
extra.plot()

In [None]:
# Add log1p-transformed spending columns to both the training and the test frame.
for frame in (data, data_test):
    frame['logDrinks'] = np.log1p(frame['Drinks'])
    frame['logFood'] = np.log1p(frame['Food'])


In [None]:
# Spending-per-time features. They are added to BOTH frames: the original only
# added them to `data`, so the training matrix ended up with two columns that
# the external test matrix did not have.
# NOTE(review): 'logFoodRate' is built from raw Food while 'logDrinkRate' uses
# logDrinks -- confirm which numerator is intended. Left unchanged here.
# NOTE(review): rows with AvgTime == 0 or missing produce inf/NaN rates.
for frame in (data, data_test):
    frame['logFoodRate']=frame["Food"]/frame['AvgTime']
    frame['logDrinkRate']=frame["logDrinks"]/frame['AvgTime']

In [None]:
# Distribution of the food-rate feature by FreePass, on a log x-axis.
rates = sns.histplot(
    data,
    x='logFoodRate',
    hue='FreePass',
)
rates.set_xscale('log')
# Disabled alternative view (joint grid of Age vs. logFoodRate):
#jnt_fooddrink = sns.JointGrid(data=data, x='Age', y='logFoodRate', hue='FreePass')
#jnt_fooddrink.plot(sns.histplot, sns.histplot)

In [None]:
# Display the column index of the processed training frame, which at this point includes the engineered columns added above.
data.columns

In [None]:
# Prepare training data -> split training set into (70% / 15% / 15%) #
# Columns excluded from the feature matrix of both the labelled and the external frame.
excluded_features = ["AvgTime",'Concert_City_NYC', 'Concert_City_SF','Concert_StartTime', 'Concert_EndTime', 'Concert_Duration']

X=data.drop(columns=["Id","FreePass"] + excluded_features)
Y=data["FreePass"]
IDs=data["Id"]

data_test_IDs = data_test["Id"]
X_test_ext = data_test.drop(columns=["Id"] + excluded_features)

# Fixed seeds so that Restart & Run All reproduces the same split and models.
# Change the constants here to study split-to-split variance.
randstate=42
randstate2=7
print(f"{randstate} {randstate2}")

# 100 -> 70/30 #
X_train, X_hold, Y_train, Y_hold, ID_train, ID_hold = train_test_split(
    X,Y,IDs,
    test_size=0.3,
    random_state=randstate,
    stratify=Y
)

# 30 -> 15/15 #
X_val, X_test, Y_val, Y_test, ID_val, ID_test = train_test_split(
    X_hold,Y_hold,ID_hold,
    test_size=0.5,
    random_state=randstate2,
    stratify=Y_hold
)

# Align the external test matrix with the training columns. The two frames are
# one-hot encoded independently, so their dummy columns can differ.
#  - one-hot columns (bool / uint8) absent from the test frame are added as 0;
#  - columns only present in the test frame are dropped;
#  - any other missing column is a feature that was never computed for the
#    test frame, which is reported instead of being silently zero-filled.
missing_cols = [c for c in X.columns if c not in X_test_ext.columns]
missing_features = [
    c for c in missing_cols
    if not (pd.api.types.is_bool_dtype(X[c]) or X[c].dtype == np.uint8)
]
if missing_features:
    raise ValueError(
        f"X_test_ext is missing engineered feature columns: {missing_features}. "
        "Compute them for data_test as well before building X_test_ext."
    )
X_test_ext = X_test_ext.reindex(columns=X.columns, fill_value=0)

In [None]:
## LINEAR REGRESSION PIPELINE -- NOT GOOD FOR THIS PROBLEM ##
# Scale (without centring) and fit an ordinary least-squares model on the label.
regression_steps = [
    ("scalar", StandardScaler(with_mean=False)),
    ("regression", LinearRegression()),
]
regression_pipeline = Pipeline(steps=regression_steps)

# Train #
regression_pipeline.fit(X_train, Y_train)

# Predict on the three internal splits #
train_pred = regression_pipeline.predict(X_train)
val_pred = regression_pipeline.predict(X_val)
test_pred = regression_pipeline.predict(X_test)

# Predict on the external test set and threshold the continuous output at 0.5.
model_output = regression_pipeline.predict(X_test_ext)
model_freepass = model_output >= 0.5

# Mean squared error and ROC AUC of the raw regression output per split.
metrics = {
    "train_mse": mean_squared_error(Y_train, train_pred),
    "val_mse":   mean_squared_error(Y_val, val_pred),
    "test_mse":  mean_squared_error(Y_test, test_pred),
    "train_auc": roc_auc_score(Y_train, train_pred),
    "val_auc":   roc_auc_score(Y_val, val_pred),
    "test_auc":  roc_auc_score(Y_test, test_pred),
}

# Submission-style frame: external Ids next to the thresholded prediction.
output = data_test_IDs.to_frame(name="Id")
output = output.assign(FreePass_pred=model_freepass)
#output.to_csv(os.getcwd()+"/results/linear_regression_jan12.csv", index=False)

metrics

In [None]:
# Candidate classifiers. Every estimator with a stochastic component is seeded
# with `randstate`; the liblinear and saga solvers shuffle the data, so the
# logistic models previously gave slightly different fits on every run.

# L1-regularised logistic regression on robust-scaled features.
logistic_l1 = Pipeline([
    ("scaler", RobustScaler()),
    ("clf", LogisticRegression(
        penalty="l1",
        solver="liblinear",
        max_iter=1000,
        class_weight="balanced",
        random_state=randstate
    ))
])

# Same model family with a standard scaler (no centring); used further down
# as the pipeline that is fitted on shuffled labels.
dlog_l1 = Pipeline([
    ("scaler", StandardScaler(with_mean=False)),
    ("clf", LogisticRegression(
        penalty="l1",
        solver="liblinear",
        max_iter=2000,
        class_weight="balanced",
        random_state=randstate
    ))
])

# Elastic-net logistic regression (30% L1 / 70% L2 mix).
elastic_log = Pipeline([
    ("scaler", RobustScaler()),
    ("clf", LogisticRegression(
        penalty="elasticnet",
        solver="saga",
        l1_ratio=0.3,
        max_iter=4000,
        class_weight="balanced",
        random_state=randstate
    ))
])

grad_boost_tree = GradientBoostingClassifier(
    n_estimators=150,
    learning_rate=0.04,
    max_depth=3,
    random_state=randstate
)

rf = RandomForestClassifier(
    n_estimators=400,
    max_depth=12,
    min_samples_leaf=15,
    class_weight="balanced",
    random_state=randstate,
    n_jobs=-1
)

def train_model(model, x_train, y_train):
    """Fit ``model`` in place on the given data and hand the same object back.

    Parameters
    ----------
    model : object exposing ``fit(x, y)``
    x_train, y_train : training features and targets, passed straight to ``fit``.

    Returns
    -------
    The ``model`` object that was passed in (not the return value of ``fit``).
    """
    fitted = model
    fitted.fit(x_train, y_train)
    return fitted

def eval_model(model, x, y=None, threshold=0.5):
    """Score ``x`` with a fitted classifier and optionally compute metrics.

    Parameters
    ----------
    model : object exposing ``predict_proba``
    x : features to score.
    y : true labels, optional. When omitted no metrics are computed.
    threshold : float, default 0.5
        Scores greater than or equal to this value are predicted positive.

    Returns
    -------
    tuple
        ``(score, output, metrics)`` where ``score`` is the second column of
        ``predict_proba``, ``output`` is ``score >= threshold`` and ``metrics``
        is ``[roc_auc, accuracy, log_loss]`` or ``[]`` when ``y`` is None.
    """
    score = model.predict_proba(x)[:, 1]
    output = score >= threshold

    # Guard clause: without labels there is nothing to measure.
    if y is None:
        return score, output, []

    auc = roc_auc_score(y, score)
    accuracy = np.sum(y == output) / (len(y))
    loss = log_loss(y, score)
    return score, output, [auc, accuracy, loss]

In [None]:
pipelines = [logistic_l1, grad_boost_tree, elastic_log, rf]
model_names = ["logistic_l1", "grad_boost_tree", "elastic_log", "rf", "dummy_model"]

# Train models #
models = [train_model(ppl, X_train, Y_train) for ppl in pipelines]

# Baseline: the same kind of pipeline fitted on shuffled labels, so any
# relationship between features and target is destroyed.
dummy_answers = Y_train.sample(frac=1.0, random_state=55).values
dummy_model = train_model(dlog_l1, X_train, dummy_answers)
models.append(dummy_model)

thresholds=np.linspace(0.2, 0.8, 40)

# Evaluate with validation split
# The threshold sweep is a model-selection step, so it runs on the validation
# split. The original swept on X_test/Y_test, which spends the held-out test
# split on tuning and leaves nothing untouched for the final estimate.
metrics_all = []
for name, m in zip(model_names, models):
    print(f"----------- ({name})")
    metrics_model = [eval_model(m, X_val, Y_val, t)[2] for t in thresholds]
    metrics_all.append(metrics_model)


In [None]:
# metrics_all is indexed [model][threshold] -> [roc_auc, accuracy, log_loss].
# Swap the last two axes so plot_metrics is indexed [model, metric, threshold].
plot_metrics = np.asarray(metrics_all, dtype=float).transpose(0, 2, 1)

# Best accuracy (metric index 1) reached by each model over the threshold sweep.
for model_idx in range(len(plot_metrics)):
    best_acc = np.max(plot_metrics[model_idx, 1, :])
    print(f"{best_acc:.4f} -- {model_names[model_idx]}")


In [None]:
plot_metrics_names = ["roc_auc", "acc", "logloss"]

plt.figure(figsize=(10, 6))
colors=['black', 'blue', 'green', 'red', 'orange']
markers = [".","x","^"]

# One colour per model, one marker per metric.
for i, model_curves in enumerate(plot_metrics):
    model_color = colors[i]
    model_label = model_names[i]
    for n, mtrc in enumerate(plot_metrics_names):
        plt.plot(
            thresholds,
            model_curves[n],
            marker=markers[n],
            color=model_color,
            label=f"{model_label} - {mtrc}",
        )

plt.title("Model Performance vs Threshold")
plt.xlabel("Threshold")
plt.ylabel("Metric value")
# Legend is placed outside the axes, to the right.
plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
## Save one submission CSV per fitted model ##

def _submission_frame(model):
    """Return an Id / FreePass frame with ``model``'s predictions on X_test_ext."""
    predicted = model.predict(X_test_ext)
    # predict returns class labels; the comparison turns them into booleans.
    return (
        data_test_IDs
        .to_frame(name="Id")
        .assign(FreePass=predicted >= 0.5)
    )

# to_csv does not create missing directories.
os.makedirs(os.getcwd()+"/results", exist_ok=True)

# train_model fits in place and returns the same object, so the fitted models
# are `logistic_l1` and `grad_boost_tree`; the names `model_logistic_l1` and
# `model_grad_boost_tree` used before were never defined in this notebook.
output_df = _submission_frame(logistic_l1)
output_df.to_csv(os.getcwd()+"/results/logistic_regression-Alex_Peter-JAN12.csv", index=False)

output_df2 = _submission_frame(grad_boost_tree)
output_df2.to_csv(os.getcwd()+"/results/gradient_boosted_tree-Alex_Peter-JAN12.csv", index=False)

In [None]:
# Coefficients of the fitted L1 logistic model, largest first.
# (Bare names in the middle of a cell are evaluated but not rendered.)
logl1_coef = pd.Series(
    logistic_l1.named_steps["clf"].coef_[0],
    index=X_train.columns,
)
l_sorted = logl1_coef.sort_values(ascending=False)
l_sorted

# Feature importances of the fitted gradient-boosted trees, largest first.
gradboosttree_coef = pd.Series(
    grad_boost_tree.feature_importances_,
    index=X_train.columns,
)
gbt = gradboosttree_coef.sort_values(ascending=False)
gbt

# Leak check: absolute correlation of every training feature with the label.
leak_check = X_train.assign(FreePass=Y_train)
label_corr = leak_check.corr()["FreePass"]
corr = label_corr.abs().sort_values(ascending=False)
corr.head(10)
