## Import Required Library 

In [105]:
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import load_model
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import f1_score,accuracy_score,recall_score, precision_score
from sklearn.linear_model import LogisticRegression
import pandas as pd
import package  as pkg
import pickle
import re
import sys
import numpy as np

## Load prepared Dataset ['Negative','Positive','Neutral']

In [11]:
df_train = pd.read_csv('Amh-Dataset/Training-dataset - Sheet1.csv')
df_test = pd.read_csv('Amh-Dataset/Test-dataset - Sheet1(1).csv')

## View the first 5 row dataset value 

In [12]:
print((df_train.head()))

                                    text sentiment
0  ባለውለታህን በእርግጫ ከመግፋት በብልሀት ሥርዓት አስይዘው፣  Positive
1                     የታመነ ሰው እጅግ ይባረካል፤  Positive
2                ትንሽ ሰው ትንሽ ነው አንሶ ያሳንሳል  Negative
3          ሆዳሞች የሚወረወርላቸውን ፍርፋሪ ካገኙ ይጮሃሉ  Negative
4     የአንዳንድ ዘረኞች አስተሳሰብ እጅግ አድርጎ ይደንቀኛል   Neutral


In [13]:
print((df_test.head()))

                                                text sentiment
0                       እንደ ማስቲካ ታኝከን እንድንጣል ኣንፈልግም።   Neutral
1  ስለዚህ ወያነ ከሃይማኖታችን እጁን ካላነሳ እኛ እንበቃዋለንና እጃችሁ ኣን...  Negative
2                              እፍሬም ተብዬው አፍህን በትከፍተው  Negative
3                     ድል ለመላው ህዝብ በወያኔ ግፍ ለሚሰቃዩት ሁሉ።   Neutral
4                                ማረምያ ቤቱ ለምን ተሰራ ታድያ   Neutral


## view the Number of total rows and columens

In [14]:
print(df_train.shape)

(629, 2)


In [15]:
print(df_test.shape)

(201, 2)


## view Null Data

In [17]:
print(df_train.dropna.sum())

AttributeError: 'function' object has no attribute 'sum'

## Data Preprocessing (clean data, duplicate data, incomplete data,null data)

In [28]:
df_train = pkg.clean_df(df_train)
df_test = pkg.clean_df(df_test)

In [31]:
clear_config = {
    'remove_url': True,
    'remove_mentions': True,
    'lowercase': True,
    'demojify': True
}

## Remove emojis, Mentions, URL from the dataset if available

In [32]:
df_train['text'] = df_train['text'].apply(pkg.clean_text, args=(clear_config,))
df_test['text'] = df_test['text'].apply(pkg.clean_text, args=(clear_config,))

## Clean and normalize characterset from the dataset 
## method to normalize a character level mismatch such as ጸሀይ and ፀሐይ

In [34]:
df_train['text'] = df_train['text'].apply(lambda x: pkg.normalize_char_level_missmatch(x))
df_test['text'] = df_test['text'].apply(lambda x: pkg.normalize_char_level_missmatch(x))

## Test using Naive Bayes Machine Algorithm and CounterVectorizer

## Using CounterVector feature extraction by converting data into numerical 

### assign independent data for training

In [58]:
#independent data
X_train = df_train['text'].values
#dependent data (target data )
y_train = df_train['sentiment'].values

In [72]:
X_test, y_test = df_test['text'].values, df_test['sentiment'].values


## feature extraction from text and transform to numeric value


In [95]:
matrix = CountVectorizer(analyzer='word', max_features=1000, ngram_range=(1, 3), lowercase=True)

print("Vocabulary...")
vector=matrix.fit( df_train['text'])
X_train=matrix.fit_transform( df_train['text']).toarray() #scaling
X_test=matrix.fit_transform(df_test['text']).toarray()
np.set_printoptions(threshold=sys.maxsize)

#document = ["የታመነ ሰው እጅግ ይባረካል"]

for i in range(2):
    print(X_train[i])

Vocabulary...
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 

### Initialize and train the model

In [96]:


#document = ["የታመነ ሰው እጅግ ይባረካል"]

print("*****Gaussian  Algorithm*****")
classifier = GaussianNB()
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)

*****Gaussian  Algorithm*****


## Measure and Evaluate Accuracy

In [97]:
#measure accuracy
#accuracy
acc = accuracy_score(y_test,y_pred)
print("the accuracy value is: {}%".format(100*acc))

the accuracy value is: 40.4040404040404%


### The measured value by GaussianNB machine algorithm is 40.40%. it is low accuracy

### calculate the precision, f1_score , recall of the machine algorithms (confusion report)

In [98]:
#classification report and confusion matrix
print(classification_report(y_test,y_pred, target_names=['Positive','Negative','Neutral']))

              precision    recall  f1-score   support

    Positive       0.40      0.24      0.30        67
    Negative       0.49      0.45      0.47        87
     Neutral       0.32      0.57      0.41        44

    accuracy                           0.40       198
   macro avg       0.40      0.42      0.39       198
weighted avg       0.42      0.40      0.40       198



## save the model created by GausianNB using pickle module

In [104]:
#make a pickle file for model
pickle.dump(classifier, open("model.pkl","wb"))
print(" GausianNB model saved successfuly  ")

 GausianNB model saved successfuly  


## Feature Extraction using Tfidf and transform to numeric

In [109]:
vectorizer = TfidfVectorizer()

vectorizer.fit(df_train['text'])
X = vectorizer.transform(df_train['text']).toarray()
y = matrix.transform(df_test['text']).toarray()
trainX, testX, trainY,testY = train_test_split(X, y_train,test_size=0.2, stratify=y_train, random_state=2)

## LogisticRegression Machine algorithm

In [111]:
logic_clf = LogisticRegression()
logic_clf.fit(trainX,trainY)
X_train_prediction = logic_clf.predict(trainX)
training_data_accuracy = accuracy_score(X_train_prediction, trainY)
print("Training Accuracy score ", training_data_accuracy*100)

Accuracy score  76.8762677484787




## the accuracy value get using the Logistic Regression is very high and it is a good solution and high accuracy

### Testing Accuracy score

In [113]:
print("Testing Data accuracy")
X_test_prediction = logic_clf.predict(testX)
testing_data_accuracy = accuracy_score(X_test_prediction, testY)
print("Accuracy score test ", testing_data_accuracy *100)

Testing Data accuracy
Accuracy score test  52.41935483870967


### predict the text using the above model

In [133]:
x_pred = logic_clf.predict(trainX)
print(x_pred[52])

Negative
