In [5]:
pip install xgboost


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [1]:
# Importing packages and data
from time import time

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (RandomForestClassifier,
                              AdaBoostClassifier,
                              GradientBoostingClassifier,
                              HistGradientBoostingClassifier)
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from lightgbm import early_stopping
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt



In [None]:
# Configuration: the values a reader is most likely to tune
DATA_PATH = 'Language Detection.csv'
SEED = 1234          # random_state shared by every candidate model
N_JOBS = 6           # worker threads for the models that parallelise
PATIENCE = 15        # rounds without validation improvement before stopping
VAL_FRACTION = 0.1   # share of the training rows held out for early stopping

# Load the data
df = pd.read_csv(DATA_PATH)

# Preprocessing our data
# Encode the language names as integers 0..n_classes-1. Recent XGBoost releases
# reject string targets, and using one encoding for every model keeps the
# comparison uniform. `label_encoder` maps predictions back to names later on.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Language'])

# Split the raw text BEFORE fitting TF-IDF so that the vocabulary and IDF
# weights are learned from the training rows only (no test-set leakage).
text_train, text_test, y_train, y_test = train_test_split(
    df['Text'], y, test_size=0.2, random_state=42)

tfidf = TfidfVectorizer(max_features=2000)
# Dense arrays are kept because the original pipeline fed dense input to all models.
X_train = tfidf.fit_transform(text_train).toarray()
X_test = tfidf.transform(text_test).toarray()

# Hold-out used only by the models whose early stopping needs an explicit
# eval set. Stratified so the languages present in the training data are
# represented on both sides of the split.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=VAL_FRACTION, stratify=y_train, random_state=SEED)

# Setting Up Our Candidate Models
models = [
    DecisionTreeClassifier(max_depth=12, random_state=SEED),
    RandomForestClassifier(n_estimators=500, max_features=0.06, n_jobs=N_JOBS, random_state=SEED),
    # The weak learner is passed positionally: the keyword was renamed from
    # `base_estimator` to `estimator` between scikit-learn releases, while the
    # first positional slot is the same in both.
    AdaBoostClassifier(DecisionTreeClassifier(max_depth=1, max_features=0.06),
                       n_estimators=500, learning_rate=0.5, random_state=SEED),
    GradientBoostingClassifier(n_estimators=2000, subsample=0.67, max_features=0.06,
                               validation_fraction=VAL_FRACTION, n_iter_no_change=PATIENCE,
                               verbose=0, random_state=SEED),
    # early_stopping=True is explicit because the default ('auto') only turns
    # it on for large training sets, which would silently ignore
    # validation_fraction / n_iter_no_change here.
    HistGradientBoostingClassifier(max_iter=2000, early_stopping=True,
                                   validation_fraction=VAL_FRACTION, n_iter_no_change=PATIENCE,
                                   verbose=0, random_state=SEED),
    # `colsample_bylevel` and `verbosity` are the parameter names XGBoost
    # recognises (the previous `colsample_level` / `verbose` were not applied).
    XGBClassifier(n_estimators=2000, tree_method='hist', subsample=0.67, colsample_bylevel=0.06,
                  early_stopping_rounds=PATIENCE, verbosity=0, n_jobs=N_JOBS, random_state=SEED),
    LGBMClassifier(n_estimators=2000, feature_fraction=0.06, bagging_fraction=0.67,
                   bagging_freq=1, verbose=0, n_jobs=N_JOBS, random_state=SEED),
    # CatBoost accepts `max_leaves` only with the Lossguide grow policy, and
    # `subsample` only with a bootstrap type that supports it (Bernoulli here).
    CatBoostClassifier(n_estimators=2000, colsample_bylevel=0.06, max_leaves=31,
                       grow_policy='Lossguide', bootstrap_type='Bernoulli',
                       subsample=0.67, verbose=0, thread_count=N_JOBS, random_state=SEED),
]

model_names = [model.__class__.__name__ for model in models]

# Extra fit() arguments that switch on early stopping for the three libraries
# that need an explicit validation set. Each library spells it differently.
es_fit_kwargs = {
    'XGBClassifier': {'eval_set': [(X_val, y_val)], 'verbose': False},
    'LGBMClassifier': {'eval_set': [(X_val, y_val)],
                       'callbacks': [early_stopping(PATIENCE, verbose=False)]},
    'CatBoostClassifier': {'eval_set': (X_val, y_val), 'early_stopping_rounds': PATIENCE},
}
es_models = list(es_fit_kwargs)

# Training and Evaluation
# Only the fit() call is timed; scoring happens after the clock is stopped.
result_rows = []
for model, name in zip(models, model_names):
    start_time = time()
    if name in es_models:
        model.fit(X_fit, y_fit, **es_fit_kwargs[name])
    else:
        model.fit(X_train, y_train)
    run_time = time() - start_time
    accuracy = model.score(X_test, y_test)
    result_rows.append([accuracy, run_time])

# Setting up our results dataframe (built once instead of grown row by row)
df_results = pd.DataFrame(result_rows, index=model_names, columns=['accuracy', 'run_time'])


# Visualizing the Results
def plot_metric(column, color, title, ylabel):
    """Draw one bar chart of `df_results[column]`, one bar per model.

    Parameters
    ----------
    column : str
        Column of `df_results` to plot ('accuracy' or 'run_time').
    color : str
        Matplotlib colour name for the bars.
    title, ylabel : str
        Figure title and y-axis label.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    df_results[column].plot(kind='bar', color=color, ax=ax)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.set_xlabel('Model')
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    plt.show()


plot_metric('accuracy', 'skyblue', 'Test Accuracy of Different Models', 'Accuracy')
plot_metric('run_time', 'lightgreen', 'Run Time of Different Models', 'Run Time (seconds)')

# User interaction for prediction
while True:
    try:
        user_input = input("Enter text to predict its language (type 'exit' to quit): ")
    except EOFError:
        # stdin was closed: leave the loop instead of raising.
        print("Exiting...")
        break
    if user_input.strip().lower() == 'exit':
        print("Exiting...")
        break
    if not user_input.strip():
        # Nothing to classify; ask again.
        continue

    text_vectorized = tfidf.transform([user_input]).toarray()
    predictions = {}
    for model, name in zip(models, model_names):
        # ravel() because some libraries return a column vector for multiclass
        # predictions; the integer code is then mapped back to its language name.
        predicted_code = int(np.ravel(model.predict(text_vectorized))[0])
        predictions[name] = label_encoder.inverse_transform([predicted_code])[0]
    print("Predictions:")
    for model_name, prediction in predictions.items():
        print(f"{model_name}: {prediction}")


