In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

In [2]:
#Load Data
df = pd.read_csv('prod_data.csv')

# replace number 
df['text'] = [re.sub(r'\d+', 'number',i) for i in df['text']]
df['text'] = [re.sub(r'%', 'percentage',i) for i in df['text']]
df['text'] = [re.sub(r'poly', 'poly poly',i) for i in df['text']]

In [3]:
#rbf
tfidf = TfidfVectorizer(lowercase=True, stop_words='english', max_features=1000)
X = tfidf.fit_transform(df['text'])
le = LabelEncoder()
y = le.fit_transform(df['verdict'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=0)

classifier = SVC(kernel='rbf')
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 93.00%


In [4]:
#testing code
input_text_vectorized = tfidf.transform(["Details: 100 % calfskin 75 % polyamide and 25 % elastane lining 100 % rubber sole"])
predicted_class_encoded = classifier.predict(input_text_vectorized)

predicted_class_label = le.inverse_transform(predicted_class_encoded)

print(f"Predicted Class: {predicted_class_label[0]}")

Predicted Class: 1


In [5]:
#linear
vectorizer = CountVectorizer(lowercase=True, stop_words='english', max_features=1000)
X = vectorizer.fit_transform(df['text'])

le = LabelEncoder()
y = le.fit_transform(df['verdict'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=0)

classifier = SVC(kernel='linear')
classifier.fit(X_train, y_train)

# Predict and evaluate
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 87.00%


In [6]:
#testing code
input_text_vectorized = vectorizer.transform(["Details: 100 % calfskin 75 % polyamide and 25 % elastane lining 100 % rubber sole"])
predicted_class_encoded = classifier.predict(input_text_vectorized)

predicted_class_label = le.inverse_transform(predicted_class_encoded)

print(f"Predicted Class: {predicted_class_label[0]}")

Predicted Class: 1


In [7]:
#poly
tfidf = TfidfVectorizer(lowercase=True, stop_words='english', max_features=1000)
X = tfidf.fit_transform(df['text'])

le = LabelEncoder()
y = le.fit_transform(df['verdict'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=0)

classifier = SVC(kernel='poly')
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 90.00%


In [8]:
label_encoder = LabelEncoder()
df['verdict'] = label_encoder.fit_transform(df['verdict'])

X_train, X_test, y_train, y_test = train_test_split(df['text'], df['verdict'], test_size=0.1, random_state=0)

count_vectorizer = CountVectorizer(max_features=1000, stop_words='english')
X_train_bow = count_vectorizer.fit_transform(X_train)
X_test_bow = count_vectorizer.transform(X_test)

svm_model = SVC(kernel='linear', C=1.0)
svm_model.fit(X_train_bow, y_train)

y_pred = svm_model.predict(X_test_bow)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.87


In [9]:
label_encoder = LabelEncoder()
df['verdict'] = label_encoder.fit_transform(df['verdict'])

X_train, X_test, y_train, y_test = train_test_split(df['text'], df['verdict'], test_size=0.1, random_state=0)

tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

svm_model = SVC(kernel='linear', C=1.0)
svm_model.fit(X_train_tfidf, y_train)

y_pred = svm_model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.92
