In [9]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

import pandas as pd
import ast
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.decomposition import TruncatedSVD

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
# 📂 Step 3: Load your CSV file (upload manually or mount Drive)
df = pd.read_csv("proper_df.csv")

# 🧹 Step 4: Combine introduction and conclusion
df["combined_text"] = df["introduction_cleaned"] + " " + df["conclusion_cleaned"]

# 🔧 Step 5: Preprocessing
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t.isalpha()]  # remove punctuation/numbers
    tokens = [t for t in tokens if t not in stop_words]
    lemmatized = [lemmatizer.lemmatize(t) for t in tokens]
    return " ".join(lemmatized)

df["clean_text"] = df["combined_text"].apply(preprocess)

# 🎯 Step 6: Labels — convert stringified lists to actual lists
df["labels"] = df["model_family_vector"].apply(ast.literal_eval)

# 🔎 Step 7: TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df["clean_text"])

# 🎯 Step 8: Get labels
y = df["labels"].tolist()

svd = TruncatedSVD(n_components=400, random_state=42)
X_reduced = svd.fit_transform(X)

# 🧪 Step 9: Train-test split (80/20)
X_train, X_test, y_train, y_test = train_test_split(
    X_reduced, y, test_size=0.2, random_state=42
)

KeyboardInterrupt: 

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
print("TF-IDF shape:", X_test.shape)

NameError: name 'X_test' is not defined

In [None]:
import matplotlib.pyplot as plt

explained = svd.explained_variance_ratio_
plt.plot(range(1, len(explained)+1), explained.cumsum())
plt.xlabel("Number of Components")
plt.ylabel("Cumulative Explained Variance")
plt.title("Explained Variance by TruncatedSVD")
plt.grid(True)
plt.show()

NameError: name 'svd' is not defined

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, accuracy_score, hamming_loss
import numpy as np

In [None]:
y_train_arr = np.array(y_train)
y_test_arr = np.array(y_test)

# Initialize the classifier
clf = OneVsRestClassifier(LogisticRegression(C=10.0, max_iter=1000, class_weight='balanced'))

# Train
clf.fit(X_train, y_train_arr)

# Predict
y_pred = clf.predict(X_test)
# Evaluate
print("Hamming Loss:", hamming_loss(y_test_arr, y_pred))
print("Exact Match Accuracy:", accuracy_score(y_test_arr, y_pred))  # Strict accuracy
print("\nDetailed classification report:")
print(classification_report(y_test_arr, y_pred))



Hamming Loss: 0.027389162561576353
Exact Match Accuracy: 0.32413793103448274

Detailed classification report:
              precision    recall  f1-score   support

           0       0.53      0.62      0.57        13
           1       0.43      0.23      0.30        13
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         2
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         1
          11       0.75      0.50      0.60         6
          12       0.00      0.00      0.00         2
          13       0.50      1.00      0.67         2
          14       0.00      0.00      0.00         0
          15       1.00  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import pandas as pd
import numpy as np
probs = clf.predict_proba(X_test)
probs
df = pd.DataFrame(probs)
# df = pd.DataFrame(y_test_arr)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,0.135752,0.033313,0.008193,0.006614,0.0,0.008832,0.016744,0.004168,0.014345,0.0,...,0.544877,0.004548,0.382311,0.033416,0.003244,0.012055,0.176468,0.005665,0.327419,0.003592
1,0.126278,0.135972,0.004331,0.004693,0.0,0.004522,0.024828,0.002322,0.006474,0.0,...,0.004290,0.003126,0.935018,0.009241,0.003807,0.002486,0.014775,0.002207,0.194494,0.003220
2,0.088133,0.063320,0.006604,0.006072,0.0,0.004891,0.012014,0.003679,0.008700,0.0,...,0.350163,0.004434,0.097025,0.033621,0.002917,0.007640,0.050306,0.009926,0.049777,0.003917
3,0.133445,0.325604,0.008652,0.009042,0.0,0.058012,0.020868,0.002997,0.009156,0.0,...,0.403647,0.003522,0.017607,0.011041,0.005231,0.009214,0.036150,0.002173,0.010482,0.005226
4,0.052589,0.019559,0.008749,0.004495,0.0,0.007842,0.014203,0.003243,0.007236,0.0,...,0.935323,0.003453,0.103813,0.011365,0.004127,0.005813,0.036230,0.009793,0.069816,0.003548
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,0.131286,0.169231,0.006109,0.007451,0.0,0.003424,0.036788,0.003076,0.003829,0.0,...,0.063901,0.003571,0.012016,0.004721,0.002907,0.003979,0.030436,0.002748,0.016319,0.004164
141,0.346138,0.099304,0.007405,0.009041,0.0,0.012243,0.054842,0.003135,0.017227,0.0,...,0.046462,0.004146,0.027554,0.009718,0.003351,0.007203,0.181282,0.016206,0.071714,0.004847
142,0.142710,0.188873,0.015243,0.021931,0.0,0.004802,0.086861,0.003846,0.009316,0.0,...,0.876177,0.004631,0.217748,0.069137,0.008873,0.021818,0.059505,0.001934,0.024191,0.005735
143,0.901903,0.016767,0.006816,0.010406,0.0,0.004574,0.009731,0.003442,0.019984,0.0,...,0.116875,0.006406,0.028767,0.030962,0.004099,0.016633,0.033132,0.003927,0.043308,0.004306
