In [9]:
from sklearn import preprocessing, metrics
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
# Full processed corpus.
df = pd.read_csv('../../Dataset/Processed dataset/processed_data.csv')

# The three split files, read in the same order as the names on the left.
train_df, test_df, validation_df = map(
    pd.read_csv,
    (
        "../../Dataset/Processed dataset/train_data.csv",
        "../../Dataset/Processed dataset/test_data.csv",
        "../../Dataset/Processed dataset/validation_data.csv",
    ),
)

In [39]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one sentence per item.

    Args:
        dataframe: Frame with a 'sentence' column (text, may contain missing
            values) and a 'label' column.
        tokenizer: Any object exposing ``encode(str)``; its return value is
            passed through unchanged.

    Each item is the tuple ``(tokenizer.encode(sentence), label)``.
    Missing sentences are encoded as the empty string.
    """

    def __init__(self, dataframe, tokenizer):
        # Replace missing values *before* casting to str. Casting first turns
        # None into the text 'None' and NaN into 'nan', which cannot be told
        # apart from a sentence that really reads "nan".
        self.text = dataframe['sentence'].fillna('').astype(str).to_numpy()
        self.labels = dataframe['label'].to_numpy()
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of sentences."""
        return len(self.text)

    def __getitem__(self, idx):
        """Return ``(encoding, label)`` for the sentence at position ``idx``."""
        return self.tokenizer.encode(self.text[idx]), self.labels[idx]
    

In [41]:
# tiktoken's 'gpt2' encoding, shared by all three splits.
tokenizer = tiktoken.get_encoding('gpt2')

# Wrap each split frame in a TextDataset; the tuple order matches the
# names on the left (train, validation, test).
train_dataset, val_dataset, test_dataset = (
    TextDataset(split_df, tokenizer)
    for split_df in (train_df, validation_df, test_df)
)

In [47]:
# Prints the object's repr; the recorded output is the default form
# (class name plus memory address), so it carries no dataset details.
print(train_dataset)

<__main__.TextDataset object at 0x000002808C461E80>


In [62]:
# Display the training frame; the rendered columns are
# 'Unnamed: 0', 'sentence', 'sentiment' and 'label'.
train_df

Unnamed: 0,sentence,sentiment,label
0,feel submissive ever,sadness,4
1,feel playful enough try new combination,joy,2
2,find broken piece feeling nothing feeling noth...,anger,0
3,feel ecstatic worry make love automatic adica ...,joy,2
4,ive feeling really jealous friend rafia im ash...,anger,0
...,...,...,...
161608,feeling nervous,fear,1
161609,feel like punished believing austin,sadness,4
161610,look back little paragraph ive written feel bi...,anger,0
161611,feel inconvenienced trimmer blade dull,sadness,4


In [60]:
# Number of entries in the training frame's 'sentence' column.
len(train_df['sentence'])

161613

In [50]:
# Two independent empty lists: per-sentence encodings and their labels.
features, labels = [], []

In [68]:
# Display the training frame again (same preview as the earlier display cell).
train_df

Unnamed: 0,sentence,sentiment,label
0,feel submissive ever,sadness,4
1,feel playful enough try new combination,joy,2
2,find broken piece feeling nothing feeling noth...,anger,0
3,feel ecstatic worry make love automatic adica ...,joy,2
4,ive feeling really jealous friend rafia im ash...,anger,0
...,...,...,...
161608,feeling nervous,fear,1
161609,feel like punished believing austin,sadness,4
161610,look back little paragraph ive written feel bi...,anger,0
161611,feel inconvenienced trimmer blade dull,sadness,4


In [70]:
# Rebuild both lists from scratch on every run. Appending to lists created in
# another cell would add a second copy of the training set each time this
# cell is re-executed.
features = []
labels = []

# Iterate the two columns directly; DataFrame.iterrows() builds a Series per
# row and is much slower on a frame of this size.
for sentence, label in zip(train_df['sentence'], train_df['label']):
    # Missing sentences are encoded as empty text (same policy as
    # TextDataset) rather than as the literal word "nan".
    text = "" if pd.isna(sentence) else str(sentence)

    # The tokenizer yields a flat list of integer token IDs.
    token_ids = tokenizer.encode(text)

    # One 1-D integer array per sentence, variable length. An empty encoding
    # stays a zero-length array; substituting a fixed 100-wide float vector
    # would inflate the padded width computed later and mix dtypes.
    # The explicit dtype keeps empty arrays integer (np.array([]) is float64).
    features.append(np.array(token_ids, dtype=np.int64))
    labels.append(label)

In [82]:
# Inspect the first encoded sentence: a 1-D array of token IDs.
features[0]

array([36410,   850, 33532,  1683])

In [86]:
# The longest encoding fixes the common row width.
max_length = max(map(len, features))

# Right-pad every encoding with zeros up to max_length. Rows are float64,
# matching what writing into an np.zeros buffer would produce.
padded_features = [
    np.concatenate((np.asarray(enc, dtype=float), np.zeros(max_length - len(enc))))
    for enc in features
]

# Stack into a (n_samples, max_length) matrix plus the label vector.
X = np.array(padded_features)
y = np.array(labels)

In [None]:
# Fit a kernel SVC with one-vs-one decision function shape on X / y.
# NOTE(review): `svm` is not bound by any import visible in this notebook
# (only `from sklearn.svm import LinearSVC` appears) — confirm it is imported
# elsewhere, otherwise this raises NameError.
# NOTE(review): X and y are assigned in more than one cell of this notebook;
# verify which assignment is live when this cell runs.
clf = svm.SVC(decision_function_shape='ovo')
clf.fit(X, y)

In [5]:
# Raw sentences and their labels from the full processed frame.
# This rebinds X and y, which an earlier cell assigns to padded token arrays.
X, y = df['sentence'], df['label']

In [16]:
# Sentences as a flat 1-D array.
flat_array = X.to_numpy().flatten()

# Missing entries become "", everything else is cast to str, in one
# vectorized pass.
flat_array_clean = np.where(pd.isna(flat_array), "", flat_array).astype(str)

# TF-IDF over at most 2500 terms, ignoring terms present in more than 80%
# of the documents.
# NOTE(review): this is fitted on every row of X; any held-out split taken
# from X_vectorized afterwards has already influenced the vocabulary and IDF.
vectorizer = TfidfVectorizer(
    max_features=2500,
    min_df=0.0,
    max_df=0.8,
)
X_vectorized = vectorizer.fit_transform(flat_array_clean)

In [18]:
# Split the cleaned raw text first, then fit TF-IDF on the training rows only.
# Splitting an already-fitted matrix lets the test rows shape the vocabulary
# and IDF weights, which leaks test information into the features.
# The row assignment is unchanged: it depends only on the sample count,
# test_size and random_state.
text_train, text_test, y_train, y_test = train_test_split(
    flat_array_clean, y, test_size=0.2, random_state=0
)

# Re-fit the configured vectorizer on the training text. After this,
# `vectorizer` matches X_train / X_test and no longer matches X_vectorized.
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

In [20]:
# Candidate models, keyed by display name. Every estimator that accepts a
# seed uses random_state=0.
classifiers = {
    'LinearSVC': LinearSVC(C=1.0, random_state=0),
    # n_jobs=-1 builds the 1000 trees on all available cores. With a fixed
    # random_state the fitted forest is the same as a serial run; only the
    # wall-clock time changes.
    'RandomForest': RandomForestClassifier(
        max_features='log2',
        n_estimators=1000,
        criterion='entropy',
        random_state=0,
        n_jobs=-1,
    ),
    'LogisticRegression': LogisticRegression(C=1.0, penalty='l2', solver='newton-cg', random_state=0),
    'MultinomialNB': MultinomialNB(alpha=1, fit_prior=False),
    'DecisionTree': DecisionTreeClassifier(
        criterion='gini',
        max_features=None,
        min_samples_leaf=1,
        min_samples_split=2,
        random_state=0,
    ),
}

# Per-model results: test-set accuracy and the predicted labels as strings.
accuracies = {}
predictions = {}

# Fit each model on the training split and score it on the test split.
for name, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    accuracies[name] = accuracy_score(y_test, y_pred)
    predictions[name] = y_pred.astype(str)

KeyboardInterrupt: 