### Install Dependencies

In [1]:
!pip install torch torchvision torchaudio
!pip install scikit-learn
!pip install regex



In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from transformers import BertTokenizer
import torch

### Import Dataset

In [3]:
# Load the raw Twitter sentiment CSV into a DataFrame.
# NOTE(review): the recorded preview of this frame shows the first record being used
# as the column labels, which suggests the file has no header row — consider
# header=None with explicit names; TODO confirm against the file.
csv_path = '/content/twitter_training.csv'
df = pd.read_csv(csv_path)

In [4]:
# Preview the first rows of the raw frame to check how the CSV was parsed.
df.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [5]:
# Inspect the column labels that read_csv derived from the file.
df.columns

Index(['2401', 'Borderlands', 'Positive',
       'im getting on borderlands and i will murder you all ,'],
      dtype='object')

In [7]:
# Swap the labels that were taken from the first CSV record for descriptive names.
# rename returns a new frame, so `df` itself is left untouched.
column_names = {
    '2401': 'subID',
    'Borderlands': 'subject',
    'Positive': 'sentiment',
    'im getting on borderlands and i will murder you all ,': 'comments',
}
data = df.rename(columns=column_names)


In [8]:
# Preview the frame again to confirm the new column names.
data.head()

Unnamed: 0,subID,subject,sentiment,comments
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [9]:
# Row count of the renamed frame.
len(data)       # the dataset contains 74681 rows

74681

In [10]:
# List the distinct values of the "subject" column.
data['subject'].unique()

array(['Borderlands', 'CallOfDutyBlackopsColdWar', 'Amazon', 'Overwatch',
       'Xbox(Xseries)', 'NBA2K', 'Dota2', 'PlayStation5(PS5)',
       'WorldOfCraft', 'CS-GO', 'Google', 'AssassinsCreed', 'ApexLegends',
       'LeagueOfLegends', 'Fortnite', 'Microsoft', 'Hearthstone',
       'Battlefield', 'PlayerUnknownsBattlegrounds(PUBG)', 'Verizon',
       'HomeDepot', 'FIFA', 'RedDeadRedemption(RDR)', 'CallOfDuty',
       'TomClancysRainbowSix', 'Facebook', 'GrandTheftAuto(GTA)',
       'MaddenNFL', 'johnson&johnson', 'Cyberpunk2077',
       'TomClancysGhostRecon', 'Nvidia'], dtype=object)

In [11]:
# List the distinct sentiment classes present in the data.
data['sentiment'].unique()

array(['Positive', 'Neutral', 'Negative', 'Irrelevant'], dtype=object)

In [12]:
# Show the distinct comment texts.
data['comments'].unique()

array(['I am coming to the borders and I will kill you all,',
       'im getting on borderlands and i will kill you all,',
       'im coming on borderlands and i will murder you all,', ...,
       'Just realized the windows partition of my Mac is now 6 years behind on Nvidia drivers and I have no idea how he didn’t notice',
       'Just realized between the windows partition of my Mac is like being 6 years behind on Nvidia drivers and cars I have no fucking idea how I ever didn ’ t notice',
       'Just like the windows partition of my Mac is like 6 years behind on its drivers So you have no idea how I didn’t notice'],
      dtype=object)

### Data Preprocessing

In [13]:
# Count missing values per column.
data.isnull().sum() # 686 values are missing, all of them in the "comments" column

subID          0
subject        0
sentiment      0
comments     686
dtype: int64

In [14]:
# Display only the rows whose "comments" value is missing.
missing_comment_mask = data['comments'].isna()
data[missing_comment_mask]



Unnamed: 0,subID,subject,sentiment,comments
60,2411,Borderlands,Neutral,
552,2496,Borderlands,Neutral,
588,2503,Borderlands,Neutral,
744,2532,Borderlands,Positive,
1104,2595,Borderlands,Positive,
...,...,...,...,...
73971,9073,Nvidia,Positive,
73972,9073,Nvidia,Positive,
74420,9154,Nvidia,Positive,
74421,9154,Nvidia,Positive,


In [15]:
# Remove rows with a missing comment; `data` is rebound to the filtered frame.
data = data.dropna(subset=['comments'])

In [16]:
# Re-count missing values to verify that none remain after the drop.
data.isnull().sum()

subID        0
subject      0
sentiment    0
comments     0
dtype: int64

In [17]:
# Draw a random subset of 700 rows; the fixed seed makes the draw repeatable.
sample_size = 700
sample_seed = 42
data = data.sample(n=sample_size, random_state=sample_seed)

In [18]:
import regex as re

def clean_sentence(sentence):
    """Strip punctuation and normalise whitespace in a single comment.

    Args:
        sentence: The raw comment text (a ``str``).

    Returns:
        The text with every character that is neither a word character nor
        whitespace removed, each run of whitespace collapsed to one space,
        and leading/trailing whitespace trimmed.
    """
    # Drop everything that is not a word character or whitespace. Note that
    # \w keeps letters, digits and underscores.
    without_punctuation = re.sub(r'[^\w\s]', '', sentence)
    # Deleting punctuation can leave doubled spaces behind ("a , b" -> "a  b"),
    # so collapse whitespace runs instead of only trimming the ends.
    cleaned_sentence = re.sub(r'\s+', ' ', without_punctuation).strip()
    return cleaned_sentence

# remove rows where comments start with a comment indicator, such as '#'.
# This check has to run on the raw text: clean_sentence deletes every '#', so the
# same test applied after cleaning could never match a row.
data = data[~data['comments'].str.lstrip().str.startswith('#')]

# strip punctuation from the remaining comments; assign returns a new frame
# instead of writing into the filtered slice
data = data.assign(comments=data['comments'].apply(clean_sentence))

# remove rows where the cleaned comments are empty
data = data[data['comments'] != '']

# Save the cleaned DataFrame to a new file
data.to_csv('cleaned_data.csv', index=False)


In [19]:
# Inspect the sampled and cleaned frame.
data

Unnamed: 0,subID,subject,sentiment,comments
61734,4984,GrandTheftAuto(GTA),Irrelevant,Do you think you can hurt me
11260,13136,Xbox(Xseries),Positive,About The time
55969,11207,TomClancysRainbowSix,Neutral,Calls from _ z1rv _ Tweet98 got me this sore...
4111,1909,CallOfDutyBlackopsColdWar,Negative,So CoD Black Ops Cold War is gonna be ass Rea...
2308,1604,CallOfDutyBlackopsColdWar,Negative,Y HAPPY ABOUT THIS
...,...,...,...,...
53398,10774,RedDeadRedemption(RDR),Irrelevant,Game pass just keeps looking better
7484,9286,Overwatch,Neutral,NEW HERO ECHO Overwatch Hot Moments pt1077 Fu...
37535,5237,Hearthstone,Negative,Dont know if anyone else having this problem...
51833,10504,RedDeadRedemption(RDR),Positive,this is a excellent time to actually go try f...


### Feature Extraction

In [20]:
# Instantiate the tokenizer belonging to the pretrained 'bert-base-uncased' checkpoint.
pretrained_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [21]:
# Tokenize every comment into fixed-length BERT inputs (padded or truncated to
# 128 tokens, returned as PyTorch tensors).
# The tokenizer is called directly: batch_encode_plus is deprecated in favour of
# __call__, which accepts the same arguments for a list of texts.
encoded_data = tokenizer(data['comments'].tolist(),
                         padding='max_length',
                         truncation=True,
                         max_length=128,
                         return_tensors='pt')

In [22]:
# Build a binary target: 1 for 'Positive', 0 for every other sentiment value
# (so 'Neutral', 'Negative' and 'Irrelevant' all map to 0).
is_positive = data['sentiment'] == 'Positive'
labels = is_positive.astype(int).tolist()

In [23]:
# Split the data into training and testing sets.
# The raw `input_ids` are vocabulary indices whose magnitudes carry no meaning, so
# they are not usable numeric features for an SVM. Each comment is therefore
# embedded with the pretrained BERT encoder first, and those vectors are split.
from transformers import BertModel

bert_model = BertModel.from_pretrained('bert-base-uncased')  # same checkpoint as the tokenizer
bert_model.eval()  # inference only: switches off dropout

embedding_batch_size = 32  # encode in small batches to keep memory bounded
cls_vectors = []
with torch.no_grad():  # no gradients are needed for feature extraction
    for start in range(0, len(labels), embedding_batch_size):
        stop = start + embedding_batch_size
        batch = {name: tensor[start:stop] for name, tensor in encoded_data.items()}
        # The final hidden state of the first ([CLS]) token serves as the sentence vector.
        cls_vectors.append(bert_model(**batch).last_hidden_state[:, 0, :])
features = torch.cat(cls_vectors).numpy()

X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.2,
                                                    random_state=42)

### Initialize and train the SVM classifier

In [24]:
# Create a support-vector classifier; only the kernel is set explicitly, every
# other SVC option keeps its default.
svm_classifier = SVC(kernel='linear')  # Linear kernel for linear SVM

In [None]:
# Fit the SVM on the training split.
svm_classifier.fit(X_train, y_train)

In [None]:
# Evaluate the classifier on the test set.
# `score` returns the mean accuracy on the given data; the cell ends with the
# value so the result is actually displayed instead of only being stored.
accuracy = svm_classifier.score(X_test, y_test)
accuracy