In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report

X_train = pd.read_csv("X_train_5.csv")
X_validation = pd.read_csv("validation_5.csv")
X_test = pd.read_csv("test_5.csv")

y_train = pd.read_csv("first_label")
y_validation = pd.read_csv("validation_first_label")
y_test = pd.read_csv("test_first_label")

# concatenate DataFrames
X_train = pd.concat([X_train, X_validation])
y_train = pd.concat([y_train, y_validation])

# reset index
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)

# ordinal encoding
le = preprocessing.OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
le.fit(X_train.values.ravel().reshape(-1, 1))
X_train_enc = le.transform(X_train.values.ravel().reshape(-1, 1))
X_test_enc = le.transform(X_test.values.ravel().reshape(-1, 1))

X_train_enc = X_train_enc.squeeze().reshape(X_train.shape[0], -1).astype(int)
X_test_enc = X_test_enc.squeeze().reshape(X_test.shape[0], -1).astype(int)

# function to build and evaluate classifiers
def build_and_evaluate_classifier(clf, X_train, X_test, y_train, y_test):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    f1 = f1_score(y_test, y_pred, average="micro")
    precision = precision_score(y_test, y_pred, average="micro")
    recall = recall_score(y_test, y_pred, average="micro")
    return f1, precision, recall

# classifiers
names = ['Naive_Bayes', 'Decision_Tree']
classifiers = [GaussianNB(), DecisionTreeClassifier(random_state=0)]

# train and evaluate classifiers
for name, clf in zip(names, classifiers):
    print('Now classifying', name)
    f1, precision, recall = build_and_evaluate_classifier(clf, X_train_enc, X_test_enc, y_train, y_test)
    print("\tAverage F1 for {}:\t\t".format(name), f1)
    print("\tAverage Precision for {}:\t".format(name), precision)
    print("\tAverage Recall for {}:\t\t".format(name), recall)

# final classification report
y_pred = classifiers[-1].predict(X_test_enc)  # Using the last classifier for classification report
print(classification_report(y_test, y_pred))


  X_train_enc = X_train_enc.squeeze().reshape(X_train.shape[0], -1).astype(int)
  X_test_enc = X_test_enc.squeeze().reshape(X_test.shape[0], -1).astype(int)
  y = column_or_1d(y, warn=True)


Now classifying Naive_Bayes
	Average F1 for Naive_Bayes:		 0.38143285867779825
	Average Precision for Naive_Bayes:	 0.38143285867779825
	Average Recall for Naive_Bayes:		 0.38143285867779825
Now classifying Decision_Tree
	Average F1 for Decision_Tree:		 0.7819136892632341
	Average Precision for Decision_Tree:	 0.7819136892632341
	Average Recall for Decision_Tree:		 0.7819136892632341
              precision    recall  f1-score   support

         adj       0.73      0.66      0.69     25402
        adja       0.15      0.20      0.17        89
        adjc       0.36      0.50      0.42        18
        adjp       0.63      0.65      0.64       156
         adv       0.76      0.72      0.74      8395
        aglt       0.94      0.89      0.91      1599
      bedzie       0.93      0.95      0.94       582
        brev       0.69      0.76      0.72      2123
        burk       0.63      0.57      0.60        21
        comp       0.94      0.93      0.93      3554
        conj      

In [2]:
X_train = pd.read_csv("X_train_9.csv")
X_validation = pd.read_csv("validation_9.csv")
X_test = pd.read_csv("test_9.csv")

y_train = pd.read_csv("first_label")
y_validation = pd.read_csv("validation_first_label")
y_test = pd.read_csv("test_first_label")

# reset index for consistency
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
X_validation.reset_index(drop=True, inplace=True)
y_validation.reset_index(drop=True, inplace=True)

# concatenate DataFrames
X_train = pd.concat([X_train, X_validation])
y_train = pd.concat([y_train, y_validation])

# ordinal encoding
le = preprocessing.OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
le.fit(X_train.values.ravel().reshape(-1, 1))

# encode features
X_train_enc = le.transform(X_train.values.ravel().reshape(-1, 1))
X_test_enc = le.transform(X_test.values.ravel().reshape(-1, 1))

# reshape encoded features
X_train_enc = X_train_enc.squeeze().reshape(X_train.shape[0], -1).astype(int)
X_test_enc = X_test_enc.squeeze().reshape(X_test.shape[0], -1).astype(int)

# function to train and evaluate classifiers
def buildClassifiers(clf, X_train, X_test, y_train, y_test):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    f1 = f1_score(y_test, y_pred, average="micro")
    precision = precision_score(y_test, y_pred, average="micro")
    recall = recall_score(y_test, y_pred, average="micro")
    return f1, precision, recall

# construct classifiers
names = ['Naive_Bayes', 'Decision_Tree']
classifiers = [GaussianNB(), DecisionTreeClassifier(random_state=0)]

# Train and evaluate classifiers
for name, clf in zip(names, classifiers):
    print('Now classifying', name)
    f1, precision, recall = buildClassifiers(clf, X_train_enc, X_test_enc, y_train, y_test)
    print("\tAverage F1 for {}:\t\t".format(name), f1)
    print("\tAverage Precision for {}:\t".format(name), precision)
    print("\tAverage Recall for {}:\t\t".format(name), recall)

# assuming 'SplitCTag' column contains lists
df = pd.read_csv("dataframe.csv")
df['SplitCTag'] = df['SplitCTag'].apply(lambda x: x[0] if isinstance(x, list) else x)

# print value counts of 'SplitCTag'
print(df['SplitCTag'].astype("category").value_counts())


  X_train_enc = X_train_enc.squeeze().reshape(X_train.shape[0], -1).astype(int)
  X_test_enc = X_test_enc.squeeze().reshape(X_test.shape[0], -1).astype(int)
  y = column_or_1d(y, warn=True)


Now classifying Naive_Bayes
	Average F1 for Naive_Bayes:		 0.37886815444212535
	Average Precision for Naive_Bayes:	 0.37886815444212535
	Average Recall for Naive_Bayes:		 0.37886815444212535
Now classifying Decision_Tree
	Average F1 for Decision_Tree:		 0.7503015482909519
	Average Precision for Decision_Tree:	 0.7503015482909519
	Average Recall for Decision_Tree:		 0.7503015482909519
SplitCTag
['interp']                                       133606
['qub']                                           41079
['conj']                                          26814
['fin', 'sg', 'ter', 'imperf']                    17872
['subst', 'sg', 'gen', 'f']                       17788
                                                  ...  
['ppron12', 'sg', 'acc', 'm3', 'sec', 'nakc']         1
['ppron12', 'sg', 'acc', 'n', 'pri', 'akc']           1
['ppron12', 'sg', 'acc', 'n', 'sec', 'nakc']          1
['pact', 'sg', 'loc', 'm3', 'imperf', 'neg']          1
['ppas', 'pl', 'loc', 'n', 'perf', 'neg']  