# This notebook creates a model to be used later for scoring in a FastAPI app

In [30]:
## Getting a list of all files in the directory with their respective paths
import glob
import pandas as pd
 
def list_files_glob(pattern='./**/*', recursive=True):
    """Return every path matched by ``pattern`` as a new list.

    Parameters
    ----------
    pattern : str
        Glob pattern handed to ``glob.glob``. The default ``'./**/*'``
        matches everything below the current working directory.
    recursive : bool
        Forwarded to ``glob.glob``; when true, ``**`` spans nested directories.

    Returns
    -------
    list of str
        The matched paths in the order ``glob.glob`` produced them. Nothing is
        filtered here, so matched directories are returned alongside files.
    """
    return list(glob.glob(pattern, recursive=recursive))
# Collect every path below the current working directory (default pattern './**/*').
lst = list_files_glob()

In [32]:
## My file path cleaner. This doesn't read the files; it only splits each path into its label, filename and full path.
def cleaner(lst):
    """Split glob paths into parallel lists of labels, filenames and paths.

    Each path is split on ``'/'``. Paths with at most two components (for
    example ``'./technologie'``) are dropped. For the rest, the second
    component is taken as the label and the third as the filename.

    Parameters
    ----------
    lst : iterable of str
        Paths such as ``'./technologie/technologie_82.txt'``.

    Returns
    -------
    tuple of (list, list, list)
        ``(label, filename, filepath)`` with one entry per kept path, in input
        order. ``filepath`` holds the original, unmodified path strings.
    """
    label, filename, filepath = [], [], []
    for path in lst:
        parts = path.split('/')
        # Guard clause: skip anything that is not at least ./<label>/<file>.
        if len(parts) <= 2:
            continue
        label.append(parts[1])
        filename.append(parts[2])
        filepath.append(path)
    return label, filename, filepath
# Split the globbed paths into three parallel lists: labels, filenames and full paths.
labels, filenames, filepaths = cleaner(lst)

In [100]:
# Sanity check: preview the first three filenames.
filenames[0:3]

['technologie_82.txt', 'technologie_96.txt', 'technologie_41.txt']

In [101]:
# Sanity check: preview the first three file paths.
filepaths[0:3]

['./technologie/technologie_82.txt',
 './technologie/technologie_96.txt',
 './technologie/technologie_41.txt']

In [102]:
# Sanity check: preview the first three labels.
labels[0:3]

['technologie', 'technologie', 'technologie']

In [25]:
## my file reader function
def read_all_files(filepath, encoding=None):
    """Read each file in ``filepath`` and return the contents as a list of strings.

    Parameters
    ----------
    filepath : iterable of str
        Paths of the text files to read, in the order the texts should come back.
    encoding : str or None, optional
        Text encoding passed to ``open``. The default ``None`` keeps the
        platform default encoding.

    Returns
    -------
    list of str
        One entry per path, holding the full text of that file.

    Raises
    ------
    OSError
        If a path cannot be opened (for example it is missing or is a directory).
    """
    texts = []
    for file in filepath:
        # The context manager closes each handle right after it is read, so
        # reading many files does not leave open file descriptors behind.
        with open(file, "r", encoding=encoding) as f:
            texts.append(f.read())
    return texts

# Read every file returned by cleaner(). The path list is named `filepaths`;
# `filepath` exists only as a local inside cleaner(), so using it here raises
# NameError on a fresh kernel.
texts = read_all_files(filepaths)

In [99]:
# Making sure the texts, filenames and file paths all have the same length.
print(len(texts), len(filenames), len(filepaths))

1006 1006 1006


In [43]:
# Collect filenames, labels and raw texts in a temporary pandas DataFrame
# so they are easier to clean together.

data = pd.DataFrame(
    {
        'filenames': filenames,
        'labels': labels,
        'texts': texts,
    }
)

In [44]:
# Display the assembled DataFrame to inspect the filenames, labels and texts columns.
data

Unnamed: 0,filenames,labels,texts
0,technologie_82.txt,technologie,Games firms 'face tough future'\n\nUK video ga...
1,technologie_96.txt,technologie,California sets fines for spyware\n\nThe maker...
2,technologie_41.txt,technologie,T-Mobile bets on 'pocket office'\n\nT-Mobile h...
3,technologie_55.txt,technologie,OnePlus 8 full specs comparison chart: 8 vs. 8...
4,technologie_69.txt,technologie,'Friends fear' with lost mobiles\n\nPeople are...
...,...,...,...
1001,graphics_38.txt,graphics,In <1pscti$aqe@travis.csd.harris.com> srp@trav...
1002,graphics_10.txt,graphics,Hello everybody !\nIf you are using PIXAR'S Re...
1003,graphics_100.txt,graphics,I need help in creating my 4x4 perspective mat...
1004,graphics_11.txt,graphics,"In article <1pp991$t63@cc.tut.fi>, jk87377@leh..."


In [61]:
## This is my main preprocessor. It cleans the text read from the files and creates my X (the model input).
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

from string import punctuation
def preprocess_text(text):
    """Lower-case ``text`` and remove every ASCII punctuation character.

    Only the characters in ``string.punctuation`` are removed. Whitespace,
    digits and non-ASCII symbols are kept.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The normalised text.
    """
    # A translation table that maps each punctuation character to "delete".
    strip_punctuation = str.maketrans('', '', punctuation)
    return text.lower().translate(strip_punctuation)
# Data preprocessing
# Overwrites the 'texts' column in place with the lower-cased, punctuation-free
# text, so the raw text is no longer available in `data` after this cell runs.
data['texts'] = data['texts'].apply(preprocess_text)

# Split data into training and testing sets
# 80/20 split with a fixed random_state so the split is reproducible.
X = data['texts']
y = data['labels']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data using TF-IDF
# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that already-fitted vocabulary.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Encode labels
# The encoder is fitted on the training labels only.
# NOTE(review): a label that occurs only in the test split would presumably make
# transform(y_test) fail as an unseen label - confirm every class reaches the training split.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [73]:
# Look up the class name that the label encoder maps to integer 0.
label_encoder.classes_[0]

'business'

In [48]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the TF-IDF training matrix
# and the integer-encoded training labels.
model = MultinomialNB()
model.fit(X_train, y_train)

In [56]:
## Saving everything I need as pickle files
import pickle
# Persist the three artifacts, one pickle file each.
# Note: 'label_encoder.pkl' holds only the encoder's classes_ array, not the
# LabelEncoder object itself.
# NOTE(review): presumably the scoring app loads it as a plain array - verify.
for pickle_path, artifact in (
    ('model.pkl', model),
    ('vectorizer.pkl', vectorizer),
    ('label_encoder.pkl', label_encoder.classes_),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(artifact, f)
    

array([ 3,  5,  5,  4,  9,  5,  5,  4,  9, 10,  5,  9,  2,  7,  7,  5,  5,
        3,  0,  1,  3,  4,  0,  0,  4,  0,  9,  0,  9,  8,  2,  2, 10,  7,
        2, 10,  8,  0,  0,  1,  7,  4,  9,  3,  9,  0,  9,  3,  8,  8,  5,
        5, 10,  5,  4,  2,  9, 10, 10, 10,  8,  7, 10,  8,  7,  7,  7,  1,
        7,  2,  1,  0,  7,  1,  5,  4,  7,  3, 10,  4,  7,  9, 10,  0, 10,
        3,  5,  8,  0,  5,  4,  8,  3,  7,  2,  9,  1,  9, 10, 10,  3,  4,
       10,  4,  4,  0,  2,  8,  3,  3,  2,  8,  5,  3,  9, 10,  2,  4,  9,
        4,  1,  8,  7,  5,  7,  7,  1,  8,  7,  1,  2,  5,  8,  7,  3,  4,
       10,  1,  4, 10, 10, 10,  0,  9,  3,  8,  2,  1,  8,  9,  9,  3,  8,
        0,  4,  3,  3,  8,  0, 10,  9,  4,  4,  4,  5,  2,  4,  4,  5,  0,
        9,  7,  7,  0,  1,  7,  2,  4, 10,  0,  4,  9,  7,  5,  7,  7,  2,
        1,  9,  3,  0,  2,  4,  4,  9,  4,  0,  2,  9,  0,  9, 10])

In [83]:
## Example output: class probabilities for the first test instance,
## sorted from most to least likely.

(
    pd.DataFrame(
        {
            'label': label_encoder.classes_,
            'probability': model.predict_proba(X_test[0])[0],
        }
    )
    .sort_values('probability', ascending=False)
)

Unnamed: 0,label,probability
3,graphics,0.53804
10,technologie,0.102733
5,medical,0.100252
8,space,0.060171
2,food,0.042057
7,politics,0.04032
1,entertainment,0.035636
9,sport,0.027876
0,business,0.027727
4,historical,0.024522


## All the metrics on test files

In [49]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the held-out test matrix.
y_pred = model.predict(X_test)

# Overall accuracy on the test split.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Per-class precision / recall / f1, keyed by the integer-encoded labels.
classification_rep = classification_report(y_test, y_pred)
print(f"Classification Report:\n{classification_rep}")

Accuracy: 0.9356435643564357
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.81      0.89        26
           1       1.00      1.00      1.00        13
           2       1.00      0.85      0.92        20
           3       1.00      1.00      1.00        18
           4       1.00      1.00      1.00        27
           5       0.68      1.00      0.81        13
           7       0.79      1.00      0.88        19
           8       1.00      0.77      0.87        22
           9       1.00      1.00      1.00        24
          10       0.91      1.00      0.95        20

    accuracy                           0.94       202
   macro avg       0.94      0.94      0.93       202
weighted avg       0.95      0.94      0.94       202

