In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report


In [None]:
!wget https://www.dropbox.com/s/5721wcs2guuykzl/stacksample.zip

--2023-07-01 06:34:15--  https://www.dropbox.com/s/5721wcs2guuykzl/stacksample.zip
Resolving www.dropbox.com (www.dropbox.com)... 162.125.5.18, 2620:100:601d:18::a27d:512
Connecting to www.dropbox.com (www.dropbox.com)|162.125.5.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: /s/raw/5721wcs2guuykzl/stacksample.zip [following]
--2023-07-01 06:34:15--  https://www.dropbox.com/s/raw/5721wcs2guuykzl/stacksample.zip
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc1c5d03509f38a6710d5cde02e7.dl.dropboxusercontent.com/cd/0/inline/B_BtQDiuO0D52FulxMZRcBo0Ky_zF1DxHhmajdVWH1xhMvmt6JHdM-JZe1u2uVAM2I45NHx6ODIErmjj6RqvCVQW-mx_hEzyamlq3lDZgMJrXcqxPfktACAEJRS5hdZ09pk5qNOUDBgYf5UG-MjEV-FDgj7VN-Tke8YyKcD6fqRt4A/file# [following]
--2023-07-01 06:34:16--  https://uc1c5d03509f38a6710d5cde02e7.dl.dropboxusercontent.com/cd/0/inline/B_BtQDiuO0D52FulxMZRcBo0Ky_zF1DxHhmajdVWH1xhMvmt6JHdM-JZe1u2uVAM2I45NHx

In [None]:
!unzip stacksample

Archive:  stacksample.zip
  inflating: Answers.csv             
  inflating: Questions.csv           
  inflating: Tags.csv                


In [None]:
# Load the three extracted tables in the same order as before
# (questions, tags, answers); all are read as ISO-8859-1.
data, tags_df, answers_df = (
    pd.read_csv(csv_name, encoding='ISO-8859-1')
    for csv_name in ('Questions.csv', 'Tags.csv', 'Answers.csv')
)

In [None]:
# Attach tags to questions: inner join on the shared 'Id' column, giving one
# output row per matching (question row, tag row) pair.
data_tags = pd.merge(data, tags_df, how='inner', on='Id')

In [None]:
# The ten most frequent values of the 'Tag' column, most frequent first.
top_tags = list(
    tags_df['Tag']
    .value_counts()
    .nlargest(10)
    .index
)

In [None]:
# Display the selected tags as the cell output.
top_tags

['javascript',
 'java',
 'c#',
 'php',
 'android',
 'jquery',
 'python',
 'html',
 'c++',
 'ios']

In [None]:
# Keep only the merged rows whose 'Tag' is one of the selected top tags.
filtered_data = data_tags.loc[data_tags['Tag'].isin(top_tags)]

In [None]:
#  Preprocess the text data
def preprocess_text(text):
    """Normalise raw question text into lowercase, letters-only words.

    Steps, in order: HTML tags are replaced by a space, every character
    outside ``a-z``/``A-Z`` becomes a space, the text is lowercased, and runs
    of whitespace are collapsed to single spaces with both ends trimmed.

    Note that symbols are removed too, so ``c#`` and ``c++`` both reduce
    to ``c``.

    Args:
        text: Raw text, possibly containing HTML markup. Non-string values
            (e.g. ``None`` or the float ``NaN`` pandas uses for missing
            cells) are treated as empty text instead of raising ``TypeError``.

    Returns:
        The cleaned string; ``''`` when no letters remain.
    """
    if not isinstance(text, str):
        return ''
    # Replace tags with a space (not '') so that adjacent elements such as
    # "</p><p>" do not fuse the surrounding words into one token.
    text = re.sub(r'<[^>]+>', ' ', text)
    # Drop digits, punctuation and any other non-letter character.
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower()
    # Only letters and spaces remain, so split/join collapses whitespace and
    # trims the ends without needing a '\s+' pattern.
    return ' '.join(text.split())


In [None]:

# Clean the Title and Body columns with preprocess_text.
# filtered_data was produced by boolean filtering of data_tags, so writing
# columns into it in place raises pandas' SettingWithCopyWarning. .assign()
# builds a new frame with the cleaned columns and the name is rebound to it.
filtered_data = filtered_data.assign(
    Title=filtered_data['Title'].apply(preprocess_text),
    Body=filtered_data['Body'].apply(preprocess_text),
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['Title'] = filtered_data['Title'].apply(preprocess_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['Body'] = filtered_data['Body'].apply(preprocess_text)


In [None]:
# Build targets and inputs, then hold out 20% of the rows for validation.
# The input text is the cleaned title and body joined by a single space.
# NOTE(review): filtered_data comes from a question/tag merge, so a question
# carrying several top tags presumably appears once per tag with identical
# text and may land on both sides of this row-level split — confirm, and
# consider splitting by question Id.
y = filtered_data['Tag']
X = filtered_data['Title'] + ' ' + filtered_data['Body']
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Vectorize the text data
# The vectorizer is capped at 5000 features (max_features=5000).
vectorizer = TfidfVectorizer(max_features=5000)
# Fit on the training split only, then reuse the fitted vectorizer on the
# validation split so both share the same feature columns.
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

In [None]:
# Encode the tags as a binary indicator matrix.
# Each row carries a single tag, so every tag is wrapped in a one-element
# list to match the list-of-labels input MultiLabelBinarizer expects.
mlb = MultiLabelBinarizer()
y_train_bin = mlb.fit_transform([[tag] for tag in y_train])
y_val_bin = mlb.transform([[tag] for tag in y_val])

In [None]:
from sklearn.multiclass import OneVsRestClassifier

In [None]:
# Train a one-vs-rest wrapper around a linear SVM on the TF-IDF features.
# The seed is fixed on the base estimator.
linear_svm = LinearSVC(random_state=42)
classifier = OneVsRestClassifier(estimator=linear_svm)
classifier.fit(X_train_vec, y_train_bin)

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from keras.models import Sequential
from keras.layers import LSTM,Dense,Embedding
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [None]:
# Make predictions on the validation set
# y_pred is the classifier's output for the vectorized validation text.
y_pred = classifier.predict(X_val_vec)
# Map each predicted row back to tag names using the fitted binarizer.
y_pred_labels = mlb.inverse_transform(y_pred)

In [None]:
# Evaluate the model.
# Row names must come from mlb.classes_: the indicator columns of y_val_bin
# and y_pred follow the binarizer's own (sorted) class order, whereas top_tags
# is ordered by frequency. Passing top_tags attaches the wrong tag name to
# every row of the report.
print(classification_report(y_val_bin, y_pred, target_names=list(mlb.classes_)))


              precision    recall  f1-score   support

  javascript       0.83      0.74      0.79     18101
        java       0.90      0.74      0.81     20459
          c#       0.91      0.73      0.81      9342
         php       0.67      0.23      0.34     11668
     android       0.94      0.83      0.88      9401
      jquery       0.89      0.69      0.78     22817
      python       0.70      0.43      0.53     24900
        html       0.59      0.29      0.39     15796
         c++       0.84      0.72      0.78     19778
         ios       0.95      0.85      0.90     13086

   micro avg       0.84      0.62      0.71    165348
   macro avg       0.82      0.63      0.70    165348
weighted avg       0.81      0.62      0.70    165348
 samples avg       0.61      0.62      0.61    165348



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:

from sklearn.metrics import accuracy_score

In [None]:
# Rebuild the validation targets and predictions used for the accuracy score.
# NOTE(review): these repeat the computations of the earlier binarizer and
# prediction cells with the same inputs.
y_val_bin = mlb.transform([[tag] for tag in y_val])
y_pred = classifier.predict(X_val_vec)
y_pred_labels = mlb.inverse_transform(y_pred)

In [None]:
# Score the predictions against the binarized validation targets.
# With indicator-matrix inputs, accuracy_score reports subset accuracy:
# a row counts as correct only if its whole label vector matches.
accuracy = accuracy_score(y_true=y_val_bin, y_pred=y_pred)

In [None]:
# Display the accuracy computed in the previous cell.
accuracy

0.6041863221810968

In [None]:
# Tag one free-text question with the trained pipeline:
# clean the text -> TF-IDF features -> classifier -> decode to tag names.
sample_question = " Are there any really good tutorials explain of python."
input_text = preprocess_text(sample_question)

# The vectorizer takes a collection of documents, hence the one-item list.
input_vec = vectorizer.transform([input_text])
pred_vec = classifier.predict(input_vec)

# inverse_transform yields one tuple of tag names per input row.
pred_labels = mlb.inverse_transform(pred_vec)

print("Predicted Tags:", pred_labels)

Predicted Tags: [('python',)]
