In [1]:
#combine all files into one df
import pandas as pd
import os
import warnings 
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides scikit-learn convergence / failed-fit
# warnings raised further down -- confirm that is intended.
warnings.filterwarnings('ignore')

# Directory (relative to the working directory) that is scanned for CSV files.
folder = 'youtube-spam-collection-v1'

def load_files_to_df(folder):
    """Read every CSV file in ``folder`` and stack them into one DataFrame.

    Files are read in sorted filename order so that the row order of the
    result (and any seeded split performed on it later) is reproducible;
    ``os.listdir`` on its own returns entries in arbitrary order.

    Parameters
    ----------
    folder : str or os.PathLike
        Directory containing the CSV files. Sub-directories are not searched.

    Returns
    -------
    pandas.DataFrame
        The rows of all CSV files concatenated, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``folder`` contains no ``.csv`` files.
    """
    csv_paths = [
        os.path.join(folder, filename)
        for filename in sorted(os.listdir(folder))
        # Require the dot so names that merely end in "csv" are skipped.
        if filename.lower().endswith('.csv')
    ]
    if not csv_paths:
        raise ValueError(f"No .csv files found in {folder!r}")

    return pd.concat((pd.read_csv(path) for path in csv_paths), ignore_index=True)

# Load every CSV found in `folder` into a single combined DataFrame.
df = load_files_to_df(folder)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

def feature_process(x):
    """Split the labelled comments and encode the text as token counts.

    Parameters
    ----------
    x : object exposing ``CONTENT`` (comment text) and ``CLASS`` (label)
        attributes, e.g. the combined DataFrame.

    Returns
    -------
    tuple
        ``(Xtrain, Xtest, ytrain, ytest, cv)``: the vectorized train and
        test comments, their labels, and the fitted ``CountVectorizer``.
    """
    comments, labels = x.CONTENT, x.CLASS

    # 75/25 split with a fixed seed.
    train_text, test_text, ytrain, ytest = train_test_split(
        comments, labels, test_size=0.25, random_state=42
    )

    # The vocabulary is learned from the training comments only and then
    # reused unchanged to encode the held-out comments.
    cv = CountVectorizer()
    Xtrain = cv.fit_transform(train_text)
    Xtest = cv.transform(test_text)

    return Xtrain, Xtest, ytrain, ytest, cv

#apply feature process to df
# Yields the vectorized train/test comments, their labels, and the fitted
# CountVectorizer (kept as `cv`).

Xtrain, Xtest, ytrain, ytest, cv = feature_process(df)
#build classification models
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# Three base learners for the ensemble.
lr = LogisticRegression()
rf = RandomForestClassifier(random_state=42)
svm = SVC(kernel='linear')

# (name, estimator) pairs; the names are the prefixes used when addressing
# each learner's hyperparameters as "<name>__<param>".
estimators = list(zip(('lr', 'rf', 'svm'), (lr, rf, svm)))

# Majority ("hard") vote over the three learners. fit() returns the fitted
# estimator itself, so `vc` is the trained ensemble.
vc = VotingClassifier(estimators=estimators, voting='hard').fit(Xtrain, ytrain)

#cross validate and hypertune
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for every tunable hyperparameter, keyed as
# "<estimator name>__<parameter>".
parameter_grid = dict(
    lr__penalty = ['l1','l2', 'elasticnet',None],
    lr__solver = ['lbfgs','liblinear','newton-cg','newton-cholesky','sag','saga'],
    lr__C = [0.1,0.5,1,2],
    lr__max_iter = [100,500,1000],
    rf__n_estimators = [50,100,200],
    rf__criterion = ['gini','entropy', 'log_loss'],
    rf__max_features = ['sqrt','log2',None],
    svm__C = [0.5,1,2],
    svm__kernel = ['linear','rbf','poly'],
    svm__gamma = [1,5,10]
)

# LogisticRegression only accepts certain solver/penalty pairs. Sampling the
# two lists independently produces invalid candidates whose fits fail and are
# scored NaN, wasting search iterations. Restrict each penalty to the solvers
# that support it.
lr_solvers_by_penalty = {
    'l1': ('liblinear', 'saga'),
    'l2': ('lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'),
    'elasticnet': ('saga',),
    None: ('lbfgs', 'newton-cg', 'newton-cholesky', 'sag', 'saga'),
}

# One distribution per penalty: identical to `parameter_grid` except that the
# penalty is pinned and the solver list is filtered to compatible solvers.
param_distributions = [
    {
        **parameter_grid,
        'lr__penalty': [penalty],
        'lr__solver': [
            solver for solver in parameter_grid['lr__solver']
            if solver in lr_solvers_by_penalty[penalty]
        ],
        # elasticnet is only valid when an L1/L2 mixing ratio is supplied.
        **({'lr__l1_ratio': [0.25, 0.5, 0.75]} if penalty == 'elasticnet' else {}),
    }
    for penalty in parameter_grid['lr__penalty']
]

# 5-fold cross-validated random search over the ensemble, scored by accuracy.
search = RandomizedSearchCV(vc, param_distributions, random_state =42, cv=5,
                            scoring = 'accuracy')
search.fit(Xtrain,ytrain)


In [2]:
import joblib

#save CountVectorizer and VotingClassifier
# Persist the fitted vectorizer and the best estimator found by the search so
# both can be reloaded with joblib.load without retraining. The paths are
# relative to the current working directory.

joblib.dump(cv, 'vectorizer.pkl')
joblib.dump(search.best_estimator_, 'model.pkl')

['model.pkl']

In [3]:
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import joblib  # For loading the model

# Load your pre-trained model and vectorizer
# NOTE: joblib.load unpickles the files, which can execute arbitrary code --
# only load artifacts that were produced by a trusted source.
model = joblib.load('model.pkl')
vectorizer = joblib.load('vectorizer.pkl')

# Initialize the Dash app
app = dash.Dash(__name__)

# Page layout: a title, a free-text box for the comment, a submit button and
# a placeholder Div. The component ids below are what callbacks refer to.
app.layout = html.Div([
    html.H1("Spam Detector"),
    # Text area holding the comment to classify; starts empty.
    dcc.Textarea(
        id='comment-input',
        value='',
        style={'width': '100%', 'height': 100}
    ),
    # n_clicks starts at 0 and is incremented by Dash on every click.
    html.Button('Check Comment', id='submit-button', n_clicks=0),
    # Filled in by the callback with the classification result.
    html.Div(id='result-output')
])

@app.callback(
    Output('result-output', 'children'),
    Input('submit-button', 'n_clicks'),
    # State (not Input): the text is read when the button is clicked, but
    # typing in the box does not trigger a prediction on its own.
    dash.dependencies.State('comment-input', 'value')
)
def update_output(n_clicks, comment):
    """Classify the entered comment when the button is clicked.

    Parameters
    ----------
    n_clicks : int or None
        Number of times the submit button has been clicked.
    comment : str or None
        Current content of the comment text area.

    Returns
    -------
    str
        ``'Result: Spam'`` or ``'Result: Not Spam'`` after a click with a
        non-blank comment; otherwise the prompt asking the user to enter a
        comment.
    """
    # Nothing to classify yet: no click so far, or the box is empty/blank.
    if not n_clicks or not comment or not comment.strip():
        return 'Enter a comment and click "Check Comment"'

    # Preprocess the comment with the vectorizer loaded from disk, so the
    # callback does not depend on variables left over from the training cell.
    X = vectorizer.transform([comment])
    prediction = model.predict(X)[0]
    return f'Result: {"Spam" if prediction == 1 else "Not Spam"}'

if __name__ == '__main__':
    # `run_server` was deprecated in favour of `run` and is no longer usable
    # in Dash 3. Prefer `run`; fall back to `run_server` only on older
    # releases that do not provide it (the fallback is evaluated lazily).
    start_server = getattr(app, 'run', None) or app.run_server
    start_server(debug=True, port=8050)