# **GASTIADI PROJECT**



<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/drive/19TMQXrycBhoJoWZcjAuHqd1EF-yexMlr?usp=sharing">
    <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/vitoatmo/gastiadi-project/blob/main/gastiadi.ipynb">
    <img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />
    View source on GitHub</a>
  </td>
  <td>
  <img src="https://img.shields.io/github/watchers/vitoatmo/gastiadi-project?style=social" />
  </td>
</table>

### Install the required packages


### Imports

In [None]:
import numpy as np
from numpy.random import RandomState
import pandas as pd
import os
import tensorflow as tf

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence

### Prep work: Downloading necessary files
Before we get started, we need to store all of the data we'll be using.
* **datset_fix.csv:** labelled text data from the project repository (`dataset/datset_fix.csv`), with a `Text` column and a `Label` column (1 = urgent, 0 = not urgent)


Import the data into session storage, then copy the path of each data file.


In [None]:
!git clone https://github.com/vitoatmo/gastiadi-project.git 

In [None]:
dataset = ('/content/gastiadi-project/dataset/datset_fix.csv')

In [None]:
dataset= pd.read_csv(dataset)

In [None]:
dataset.head()

In [None]:
dataset.tail()

In [None]:
print((dataset.Label == 1).sum()) # number of rows labelled 1 (urgent)
print((dataset.Label == 0).sum()) # number of rows labelled 0 (not urgent)

In [None]:
dataset.Text

In [None]:
from collections import Counter

#Count Unique Word

def counter_word(text_col):
    """Tally how often each whitespace-separated token occurs in a text column.

    Args:
        text_col: Object exposing ``.values`` that yields strings
            (here, a pandas Series of sentences).

    Returns:
        collections.Counter mapping each token to its total number of
        occurrences across all entries.
    """
    tally = Counter()
    for sentence in text_col.values:
        # str.split() with no argument splits on any run of whitespace
        tally.update(sentence.split())
    return tally
  
counter = counter_word(dataset.Text)

In [None]:
len(counter)

In [None]:
# Hold out the last 20% of rows for validation; the first 80% are used for training.
# NOTE(review): rows are sliced in file order without shuffling — confirm the CSV is not sorted by label.
import numpy
train_size = int(len(dataset) * 0.8)

train_ds = dataset.iloc[:train_size]
val_ds = dataset.iloc[train_size:]

# Separate the text inputs from the labels as NumPy arrays
train_sentences, train_labels = train_ds["Text"].to_numpy(), train_ds["Label"].to_numpy()
val_sentences, val_labels = val_ds["Text"].to_numpy(), val_ds["Label"].to_numpy()

In [None]:
train_sentences.shape, val_sentences.shape

In [None]:
#Tokenizer
# Import from tensorflow.keras, like the rest of the notebook, instead of the
# standalone `keras` package, so the Tokenizer matches the Keras used for the model.
from tensorflow.keras.preprocessing.text import Tokenizer

# Vectorize a text corpus by turning each text into a sequence of integers.
# `counter` holds the whitespace-split tokens of the dataset; its size is used
# as the vocabulary cap passed to the tokenizer.
num_unique_words = len(counter)
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(train_sentences)  # fit only on the training split

In [None]:
# Each word has a unique integer index
word_index = tokenizer.word_index

In [None]:
word_index

In [None]:
# Convert each text into a list of integer token ids using the fitted tokenizer.
# NOTE(review): the results overwrite the raw-text variables, so re-running this
# cell would pass token ids (not text) back into texts_to_sequences.
train_sentences=tokenizer.texts_to_sequences(train_sentences)
val_sentences=tokenizer.texts_to_sequences(val_sentences)

In [None]:
print(train_sentences[10:15])
print(val_sentences[10:15])

In [None]:
# Bring every token sequence to the same fixed length
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fixed sequence length: longer sequences are cut at the end,
# shorter ones are padded at the end
max_length = 20

train_padded = pad_sequences(
    train_sentences,
    maxlen=max_length,
    padding='post',
    truncating='post',
)
val_padded = pad_sequences(
    val_sentences,
    maxlen=max_length,
    padding='post',
    truncating='post',
)
(train_padded.shape, val_padded.shape)

In [None]:
#Check the padding
train_padded[10]

In [None]:
# Build the classifier: embedding -> single LSTM -> one sigmoid unit
from tensorflow.keras import layers
from tensorflow import keras

# NOTE(review): the LSTM is given activation='relu' explicitly — confirm this
# choice is intentional rather than the layer's default activation.
model = keras.models.Sequential([
    layers.Embedding(num_unique_words, 32, input_length=max_length),
    layers.LSTM(64, activation='relu', dropout=0.1),
    layers.Dense(1, activation='sigmoid'),
])
model.summary()


In [None]:
# The output layer applies a sigmoid, so the loss receives probabilities
# (from_logits=False).
loss = keras.losses.BinaryCrossentropy(from_logits=False)
# `learning_rate` is the supported keyword; the `lr` alias is deprecated and
# rejected by newer Keras releases.
optim = keras.optimizers.Adam(learning_rate=0.01)
metrics = ['accuracy']

model.compile(loss=loss, optimizer=optim, metrics=metrics)

In [None]:
# Train for 20 epochs, evaluating on the validation split after each epoch
history = model.fit(
    train_padded,
    train_labels,
    epochs=20,
    validation_data=(val_padded, val_labels),
    verbose=2,
)

In [None]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

def plot_history(history):
    """Plot training vs. validation accuracy and loss, one point per epoch.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch lists (here, the value returned by ``model.fit``). It
            must contain ``loss`` and ``val_loss``, plus accuracy curves stored
            as ``accuracy``/``val_accuracy`` or as ``acc``/``val_acc``.

    Raises:
        KeyError: If none of the accepted names for a curve is present.
    """
    records = history.history

    def first_present(*names):
        # The model is compiled with metrics=['accuracy'], so the curves are
        # recorded as 'accuracy'/'val_accuracy'; the short names are accepted
        # as a fallback instead of failing on a hard-coded key.
        for name in names:
            if name in records:
                return records[name]
        raise KeyError(
            f"none of {names} found in history; available keys: {sorted(records)}"
        )

    acc = first_present('accuracy', 'acc')
    val_acc = first_present('val_accuracy', 'val_acc')
    loss = records['loss']
    val_loss = records['val_loss']
    x = range(1, len(acc) + 1)  # epoch numbers start at 1

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()

# Call the function with the training history; printing `plot_history` itself
# would only show the function object and draw nothing.
plot_history(history)

In [None]:
# Pass the labels together with the inputs so loss and accuracy are computed
# against the true targets.
model.evaluate(train_padded, train_labels)

In [None]:
# Threshold the predicted probabilities at 0.5 to get hard 0/1 labels
predictions = (model.predict(train_padded).ravel() > 0.5).astype(int).tolist()

In [None]:
# Show rows 10-19 of the tokenized training inputs
print(train_sentences[10:20])

# True labels, then the thresholded predictions, for the same rows
print(train_labels[10:20])
print(predictions[10:20])

In [None]:
# NOTE(review): fits a scikit-learn LinearRegression on the padded token ids
# against the labels; `regressor` is not referenced by any later cell shown in
# this notebook — confirm whether this cell is still needed.
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()

#Fitting model with training data
regressor.fit(train_padded, train_labels)

In [None]:
# Persist the trained Keras model under the name "model_gastiadi"
model.save("model_gastiadi")

In [None]:
import shutil
# Zip the contents of the "model_gastiadi" directory into model_gastiadi.zip
shutil.make_archive('model_gastiadi', 'zip', 'model_gastiadi')

In [None]:
# Colab-specific helper: sends the zipped model to the browser as a download
from google.colab import files
files.download("/content/model_gastiadi.zip")