## Model Fine Tuning: XGBoost and SVM!

1. Prep of the data

In [1]:
import pandas as pd
resumen_alcance3=pd.read_csv('deepldataset2.csv')
resumen_alcance3=resumen_alcance3.drop(['Unnamed: 0'], axis=1)
resumen_alcance3=resumen_alcance3.drop_duplicates()
resumen_alcance3.shape
dataset_cleaned_eng = pd.read_csv("data_cleaned2.csv")
dataset_cleaned_eng.shape

(1358, 54)

In [2]:
dataset_cleaned_eng['Resumen_proyecto_eng']=resumen_alcance3['Resumen proyecto']
dataset_cleaned_eng.shape

(1358, 55)

In [4]:
dataset_noBig= pd.read_csv('data_cleaned3_noBig.csv')
dataset_noBig=dataset_noBig.drop(['Unnamed: 0'], axis=1)
dataset_noBig=dataset_noBig.drop_duplicates()

In [5]:
df_merged=pd.merge(dataset_noBig, dataset_cleaned_eng, on=['Fecha inicio', 'Agrupación IT Demanda', 'Gestor Demanda', 'Unidad IT',
       'Categoría', 'IT Manager', 'Prioridad', 'Business Owner',
       'Agrupación IT de Ejecución', 'Objetivo', 'Informacional',
       'Área Peticionaria', 'Resumen proyecto', 'Alcance', 'Horas'], how = 'inner')

In [7]:
# Identify the desired columns
selected_columns = ['Resumen_proyecto_eng', 'Horas', 'Fecha inicio']  # Replace with the actual column names

# Create a new dataset with selected columns
dataset_final = pd.DataFrame()

# Copy selected columns
for column in selected_columns:
    dataset_final[column] = df_merged[column]

In [8]:
#Split the hours in bins

bins = [0, 300, 800, 2000, 47000]  
labels = ['0-300', '301-800','801-2000','+2000']  
target_categories = pd.cut(dataset_final['Horas'], bins=bins, labels=labels)


dataset_final['Horas Categories'] = target_categories
dataset_final = dataset_final.drop(columns = ["Horas"])
dataset_final['Horas Categories'].value_counts()

Horas Categories
301-800     372
801-2000    338
0-300       320
+2000       258
Name: count, dtype: int64

## Text preprocessing

In [9]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures

# Download required NLTK resources
nltk.download("punkt")  # For Stemming
nltk.download("wordnet")  # For Lemmatization
nltk.download("stopwords")  # For Stopword Removal
nltk.download("omw-1.4")

# Import module
from nltk.tokenize import RegexpTokenizer

# Create an instance of RegexpTokenizer for alphanumeric tokens
preprocessed = []
tokeniser = RegexpTokenizer(r'[a-zA-Z0-9]+')
lemmatiser = WordNetLemmatizer()

for resumen in dataset_final['Resumen_proyecto_eng']:
    try:
        tokens = tokeniser.tokenize(resumen)
        lemmas = [lemmatiser.lemmatize(word.lower(), pos='v') for word in tokens]
        key_words = [word for word in lemmas if word not in stopwords.words('english') and len(word) > 2 and not any(char.isdigit() for char in word)]

        
        # Extract bigrams
        #bigram_measures = BigramAssocMeasures()
        #finder = BigramCollocationFinder.from_words(key_words)
        #bigrams = finder.nbest(bigram_measures.pmi, 15)  # Adjust the number of top bigrams as desired
        
        preprocessed.append(key_words)
    except:
        preprocessed.append(None)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nathanielthomascopeland/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nathanielthomascopeland/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nathanielthomascopeland/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/nathanielthomascopeland/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [10]:
#double relevance
from sklearn.feature_extraction.text import TfidfVectorizer
# Step 1: Combine all words into a single list
all_words = [word for sublist in preprocessed for word in sublist]

# Step 2: Calculate TF-IDF scores
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform([" ".join(all_words)])

# Step 3: Define threshold and filter words
threshold = 0.02 # Adjust the threshold as desired
filtered_words = [word for word, score in zip(all_words, X.toarray()[0]) if score > threshold]

# Step 4: Filter the preprocessed list using the filtered words
filtered_preprocessed = []
for sublist in preprocessed:
    filtered_sublist = [word for word in sublist if word in filtered_words]
    filtered_preprocessed.append(filtered_sublist)


## Model training: filtered

In [13]:
#1 Bag of Word Modelling

from sklearn.preprocessing import MultiLabelBinarizer

count_vec = MultiLabelBinarizer()
mlb = count_vec.fit(filtered_preprocessed)
df_filtered = pd.DataFrame(mlb.transform(filtered_preprocessed), columns=[mlb.classes_])

df_filtered.to_csv('preprocessedbadofwordsresumen2_filtered.csv')
df=pd.read_csv('preprocessedbadofwordsresumen2_filtered.csv')

df['Fecha inicio'] = pd.to_datetime(dataset_final['Fecha inicio'])
df['Horas_bins']=dataset_final['Horas Categories']
df=df.dropna(axis=0, how='any', subset=None, inplace=False)
df=df.drop(columns=['Unnamed: 0'])
df.shape


(1288, 230)

In [14]:
#XGBoostCategories
categories_num=[]
for x in df["Horas_bins"]:
    if x=='0-300':
        categories_num.append(0)
    elif x=='301-800':
        categories_num.append(1)
    elif x=='801-2000':
        categories_num.append(2)
    elif x=='+2000':
        categories_num.append(3)
    else:
        categories_num.append(None)
        
df["Category numbers"]=categories_num

In [15]:
split_date= pd.to_datetime("2022-04-01")
df_before=df[df['Fecha inicio']<split_date]
df_after=df[df['Fecha inicio']>=split_date]

In [16]:
#Doing all the splits
y_after=df_after["Category numbers"]
X_after=df_after.drop(columns=["Category numbers","Horas_bins", "Fecha inicio"])
y=df_before["Category numbers"]
X=df_before.drop(columns=["Category numbers","Horas_bins", "Fecha inicio"])
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [17]:
#Gridsearch SVC Model
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)
svm_model = SVC()

param_grid = {
    'pca__n_components': [0.8, 0.9, 0.95],
    'svm__C': [0.1, 1, 10],
    'svm__kernel': ['linear', 'rbf'],
    'svm__gamma': ['scale', 'auto'],
    'svm__shrinking': [True, False],
    'svm__probability': [True, False]
}

steps = [('pca', pca), ('svm', svm_model)]
pipesvm = Pipeline(steps)

grid_search = GridSearchCV(pipesvm, param_grid=param_grid, cv=3)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_model = pipesvm.set_params(**best_params)
best_model.fit(X_train, y_train)

test_score = best_model.score(X_test, y_test)
After_score = best_model.score(X_after, y_after)

print("Best Hyperparameters:", best_params)
print("Test Set Score:", test_score)

Best Hyperparameters: {'pca__n_components': 0.9, 'svm__C': 1, 'svm__gamma': 'scale', 'svm__kernel': 'rbf', 'svm__probability': True, 'svm__shrinking': True}
Test Set Score: 0.42
