In [22]:
import numpy as np
import sklearn
from sklearn import preprocessing
from sklearn import svm
from sklearn.metrics import f1_score, accuracy_score, classification_report

def normalize_data(train_data, test_data, type=None):
    """Scale or normalize train/test features with a scikit-learn transformer.

    The transformer is fitted on ``train_data`` only and then applied to both
    sets, so nothing from ``test_data`` is used while fitting.

    Parameters
    ----------
    train_data : array-like
        Training features; used to fit the transformer.
    test_data : array-like
        Test features; transformed with the transformer fitted on
        ``train_data``.
    type : {None, "standard", "min_max", "l1", "l2"}, optional
        Which transformation to apply:

        * ``None``       - return both inputs untouched (default).
        * ``"standard"`` - ``StandardScaler``.
        * ``"min_max"``  - ``MinMaxScaler``.
        * ``"l1"``/``"l2"`` - ``Normalizer`` with that norm.

    Returns
    -------
    tuple
        ``(train, test)``: the original objects when ``type`` is ``None``,
        otherwise the transformed versions of both inputs.

    Raises
    ------
    ValueError
        If ``type`` is not one of the supported values. (Previously this
        surfaced as an ``UnboundLocalError`` on ``scaler``.)
    """
    if type is None:
        return train_data, test_data

    if type == "standard":
        scaler = sklearn.preprocessing.StandardScaler()
    elif type == "min_max":
        scaler = sklearn.preprocessing.MinMaxScaler()
    elif type in ("l1", "l2"):
        scaler = sklearn.preprocessing.Normalizer(norm=type)
    else:
        # Fail loudly: silently skipping normalization on a typo would be
        # very hard to notice downstream.
        raise ValueError(
            f"Unknown normalization type: {type!r}; "
            "expected None, 'standard', 'min_max', 'l1' or 'l2'"
        )

    # Fit on the training split only, then apply the same transform to both.
    scaler.fit(train_data)
    return scaler.transform(train_data), scaler.transform(test_data)

In [23]:
# Load the tokenized sentences and their labels for the train and test splits.
# NOTE(review): allow_pickle=True makes np.load unpickle object arrays, which
# can execute arbitrary code - only load these files from a trusted source.
train_sentences = np.load("data/training_sentences.npy", allow_pickle = True)
train_labels = np.load("data/training_labels.npy", allow_pickle = True)
test_sentences = np.load("data/test_sentences.npy", allow_pickle = True)
test_labels = np.load("data/test_labels.npy", allow_pickle = True)

In [24]:
# Peek at the first ten training sentences (each one is a list of word tokens).
train_sentences[:10]

array([list(['Probably', 'not', 'still', 'going', 'over', 'some', 'stuff', 'here']),
       list(['I', 'HAVE', 'A', 'DATE', 'ON', 'SUNDAY', 'WITH', 'WILL']),
       list(['Thanks', '4', 'your', 'continued', 'support', 'Your', 'question', 'this', 'week', 'will', 'enter', 'u', 'in2', 'our', 'draw', '4', 'Â£100', 'cash', 'Name', 'the', 'NEW', 'US', 'President', 'txt', 'ans', 'to', '80082']),
       list(['Dear', '0776xxxxxxx', 'Uve', 'been', 'invited', 'to', 'XCHAT', 'This', 'is', 'our', 'final', 'attempt', 'to', 'contact', 'u', 'Txt', 'CHAT', 'to', '86688', '150pMsgrcvdHGSuite3422LandsRowW1J6HL', 'LDN', '18yrs']),
       list(['I', 'sent', 'my', 'scores', 'to', 'sophas', 'and', 'i', 'had', 'to', 'do', 'secondary', 'application', 'for', 'a', 'few', 'schools', 'I', 'think', 'if', 'you', 'are', 'thinking', 'of', 'applying', 'do', 'a', 'research', 'on', 'cost', 'also', 'Contact', 'joke', 'ogunrinde', 'her', 'school', 'is', 'one', 'me', 'the', 'less', 'expensive', 'ones']),
       list(['Koth

In [25]:
# Peek at the first ten training labels.
# NOTE(review): presumably 1 = spam and 0 = ham - confirm against the dataset.
train_labels[:10]

array([0, 0, 1, 1, 0, 0, 0, 0, 1, 0])

In [26]:
class BagOfWords:
    """Bag-of-words vectorizer backed by a word -> column-index vocabulary.

    Words are used exactly as given (no lower-casing or other cleanup), so
    ``'I'`` and ``'i'`` are distinct vocabulary entries.
    """

    def __init__(self):
        """Start with an empty vocabulary."""
        # word -> column index in the feature matrix
        self.vocab = {}
        # column index -> word (inverse of ``vocab``), in first-seen order
        self.words = []

    def build_dictionary(self, data):
        """Add every not-yet-known word found in ``data`` to the vocabulary.

        Each new word gets the next free column index. Calling this again
        extends the existing vocabulary rather than resetting it.

        Parameters
        ----------
        data : iterable of iterables of words
            The sentences to scan, e.g. a list of word lists.

        Prints the vocabulary size once all sentences have been processed.
        """
        for sentence in data:
            for word in sentence:
                if word not in self.vocab:
                    self.vocab[word] = len(self.words)
                    self.words.append(word)
        print(len(self.words))

    def get_features(self, data):
        """Build the word-count matrix for ``data``.

        Parameters
        ----------
        data : sequence of iterables of words
            The sentences to vectorize. Anything supporting ``len()`` works
            (a plain list of lists as well as a NumPy array).

        Returns
        -------
        numpy.ndarray
            Float array of shape ``(len(data), len(self.words))`` where entry
            ``[i, j]`` is the number of times vocabulary word ``j`` occurs in
            sentence ``i``. Words missing from the vocabulary are ignored.
        """
        # len() instead of data.shape[0] so plain Python lists are accepted too.
        features = np.zeros((len(data), len(self.words)))
        for idx, sentence in enumerate(data):
            for word in sentence:
                column = self.vocab.get(word)
                if column is not None:
                    # Count occurrences, not just presence.
                    features[idx, column] += 1
        return features

In [27]:
# Build the vocabulary from the training sentences only; words that appear
# solely in the test set are ignored by get_features.
bow = BagOfWords()
bow.build_dictionary(train_sentences)

# Turn both splits into count matrices with one column per vocabulary word.
train_features = bow.get_features(train_sentences)
test_features = bow.get_features(test_sentences)

# Sanity check: both matrices must have the same number of columns.
print(train_features.shape)
print(test_features.shape)

9522
(3734, 9522)
(1840, 9522)


In [28]:
# Word-count vector of the first training sentence.
train_features[0]

array([1., 1., 1., ..., 0., 0., 0.])

In [29]:
# Try L2 normalization. The result is only inspected in the next cell: the
# later 'l1' and 'min_max' cells reassign these same two variables.
train_features_normalized, test_features_normalized = normalize_data(train_features, test_features, 'l2')

In [30]:
# First ten features of the first training sample after the normalization above.
train_features_normalized[0, :10]

array([0.35355339, 0.35355339, 0.35355339, 0.35355339, 0.35355339,
       0.35355339, 0.35355339, 0.35355339, 0.        , 0.        ])

In [31]:
# Try L1 normalization. This reassigns the variables produced by the 'l2' cell
# and is itself overwritten by the 'min_max' cell further down.
train_features_normalized, test_features_normalized = normalize_data(train_features, test_features, 'l1')

In [32]:
# First ten features of the first training sample after the normalization above.
train_features_normalized[0, :10]

array([0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.   ,
       0.   ])

In [33]:
# Min-max scaling, fitted on the training features. When the notebook is run
# top to bottom, this is the version the SVM below is trained and evaluated on.
train_features_normalized, test_features_normalized = normalize_data(train_features, test_features, 'min_max')

In [34]:
# First ten features of the first training sample after the scaling above.
train_features_normalized[0, :10]

array([1.        , 0.33333333, 1.        , 0.5       , 0.5       ,
       0.5       , 0.5       , 0.5       , 0.        , 0.        ])

In [37]:
# Linear-kernel SVM trained on whatever normalize_data produced last
# (the min-max scaled features when the notebook is run top to bottom).
model = svm.SVC(C=1.0, kernel='linear')

model.fit(train_features_normalized, train_labels)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [39]:
# Predict on the test split (normalized the same way as the training data)
# and report plain accuracy.
test_preds = model.predict(test_features_normalized)

accuracy_score(test_labels, test_preds)

0.9815217391304348

In [41]:
# F1 score with scikit-learn's defaults, i.e. for the positive class (label 1).
f1_score(test_labels, test_preds)

0.9285714285714286

In [42]:
# Per-class precision, recall, F1 and support, plus the averaged rows.
print(classification_report(test_labels, test_preds))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1585
           1       1.00      0.87      0.93       255

    accuracy                           0.98      1840
   macro avg       0.99      0.93      0.96      1840
weighted avg       0.98      0.98      0.98      1840



In [43]:
# The linear SVM exposes one weight per vocabulary word: shape (1, n_words).
print(model.coef_.shape)

(1, 9522)


In [44]:
# Drop the leading length-1 axis so weights[i] is the weight of bow.words[i].
weights = np.squeeze(model.coef_)
print(weights)

[-0.05515992 -0.02012771 -0.14455944 ... -0.01373508 -0.01373508
  0.        ]


In [46]:
# Order the vocabulary indices by SVM weight, most negative first.
idxes = np.argsort(weights)
# As an array so it can be indexed with the sorted index array.
words = np.array(bow.words)

# The ten words with the lowest and the ten with the highest weights.
# NOTE(review): for a binary SVC, negative weights should push towards
# label 0 and positive weights towards label 1 - confirm via model.classes_.
print('Negative', words[idxes[:10]])
print('Positive', words[idxes[-10:]])

Negative ['me' 'taken' 'I' 'work' 'him' 'favour' 'Im' 'Lmaonice' 'Oh' 'infront']
Positive ['FREE>RingtoneReply' '85233' 'won' '1' '08718738034' 'mobile' 'REAL'
 '02070836089' '84484' 'ringtoneking']
