In [None]:
import pandas as pd;
import sklearn;

from sklearn.model_selection import train_test_split;
from sklearn.pipeline import Pipeline;
from sklearn.svm import LinearSVC;
from sklearn.feature_extraction.text import TfidfVectorizer;
from sklearn.metrics import confusion_matrix, classification_report;

# Sentence classification with a linear SVM trained on TF-IDF features.

# Load the dataset from a TSV (tab-separated values) file.
data_frame = pd.read_csv("data.tsv", sep="\t", encoding="utf-8")

# Features (X) are the raw sentences; targets (y) are their class labels.
X = data_frame["recenica"]
y = data_frame["labela"]

# Hold out 33% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
# NOTE(review): the split is not stratified, so a rare class can end up with
# very few test samples -- consider passing stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Classification pipeline:
# 1. Convert the text into TF-IDF vectors.
# 2. Fit a linear Support Vector Machine on those vectors.
model = Pipeline([
    ("vectorizer", TfidfVectorizer()),
    ("svc", LinearSVC()),
])

# Train the model on the training split.
model.fit(X_train, y_train)

# Predict labels for the held-out test split.
predictions = model.predict(X_test)

# Build the confusion matrix (rows: true labels, columns: predicted labels).
# The label order comes from the fitted model rather than a hard-coded list,
# so the row/column names always match the matrix layout -- also when the
# data contains other labels or a class is missing from the test split.
class_labels = model.classes_
conf_matrix = pd.DataFrame(
    confusion_matrix(y_test, predictions, labels=class_labels),
    index=class_labels,
    columns=class_labels,
)

print(conf_matrix)

# Print per-class precision, recall and F1-score.
print(classification_report(y_test, predictions))




              precision    recall  f1-score   support

         emo       0.50      0.33      0.40         3
        info       0.94      0.97      0.95        30

    accuracy                           0.91        33
   macro avg       0.72      0.65      0.68        33
weighted avg       0.90      0.91      0.90        33

