<a href="https://colab.research.google.com/github/murali-marimekala/ml_text_classification/blob/main/notebooks/ml_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [82]:
#Code to predict the text
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import joblib
from google.colab import files
import zipfile

In [83]:
def load_data(folder_path):
  data = []
  labels = []
  for root, _, files in os.walk(folder_path):
    for file_name in files:
      if file_name.endswith(".txt"):
        file_path = os.path.join(root, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
          content = file.read()
          label = file_name.split("_")[0]
          data.append(content)
          labels.append(label)
  return pd.DataFrame({"text": data, "label": labels})

In [84]:
# Step 2: Train a model
def train_model(dataframe):
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(dataframe['text'])
    y = dataframe['label']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = MultinomialNB()
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    print("Classification Report:\n", classification_report(y_test, predictions))

    return model, vectorizer

In [85]:
# Step 3: Save the model and vectorizer
def save_model(model, vectorizer, model_path, vectorizer_path):
    joblib.dump(model, model_path)
    joblib.dump(vectorizer, vectorizer_path)

In [86]:
def predict_new(zip_file_path, model, vectorizer):
    """
    Predicts the labels of text files within a zip file using a trained model.

    Args:
        zip_file_path (str): Path to the zip file containing text files.
        model: Trained classification model.
        vectorizer: Trained TF-IDF vectorizer.
    """

    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        for file_name in zip_ref.namelist():  # Iterate through files in the zip
            if file_name.endswith(".txt"):
                with zip_ref.open(file_name) as file:
                    content = file.read().decode('utf-8')  # Read content and decode
                    vectorized_content = vectorizer.transform([content])
                    prediction = model.predict(vectorized_content)
                    print(f"File: {file_name}, Predicted Label: {prediction[0]}")

In [87]:
if __name__ == "__main__":
    # Check if the file already exists and delete it
    if os.path.exists("training_data.zip"):  # Replace with your actual file name
        os.remove("training_data.zip")

    # Open a file dialog to select the folder
    uploaded = files.upload()  # This will prompt the user to upload files
    zip_file_name = list(uploaded.keys())[0]  # Assuming only one file is uploaded
    if not zip_file_name:
        print("No file selected. Exiting.")
        exit()

    # Extract the contents of the ZIP file
    with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
        zip_ref.extractall()
    data_folder = zip_file_name[:-4]  # Remove '.zip' extension to get folder name

    model_path = "text_classifier_model.pkl"
    vectorizer_path = "text_vectorizer.pkl"

    # Load and preprocess data
    df = load_data(data_folder)
    #print(os.listdir(data_folder))

    # Train the model
    model, vectorizer = train_model(df)

    # Save the model and vectorizer
    save_model(model, vectorizer, model_path, vectorizer_path)

    # Step 4: Predict on new data
    print("Upload new data files for prediction")
    # Check if the file already exists and delete it
    if os.path.exists("predict_data.zip"):  # Replace with your actual predict file name
        os.remove("predict_data.zip")
    prediction_files = files.upload()  # Prompt to upload new files for prediction
    predict_folder = list(prediction_files.keys())[0]
    predict_new(predict_folder, model, vectorizer)

Saving training_data.zip to training_data.zip
Classification Report:
               precision    recall  f1-score   support

        news       0.00      0.00      0.00       0.0
      sports       0.00      0.00      0.00       1.0
        tech       0.00      0.00      0.00       1.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0

Upload new data files for prediction


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving predict_data.zip to predict_data.zip
File: predict_data/file1.txt, Predicted Label: news
File: predict_data/file2.txt, Predicted Label: news
