In [13]:
%run Preprocessing.ipynb

# Online monitoring

## - Extract peaks

In [14]:
import pandas as pd
import datetime
import time

# 2022-03-10
# 2022-04-09
# 2022-06-18
# 2022-07-20

def extract_peaks():
    """Extract the tweets of each monthly peak day into its own monitoring file.

    For every (month, peak day) pair below, reads ``cleaned/<month>-cleaned.csv``,
    keeps the rows whose ``datetime`` value lies within the peak day
    (00:00:00 to 23:59:59 +00:00, both ends included) and writes them to
    ``./monitoring/period<N>.csv``.

    Output files are numbered 1, 2, 4 and 5: number 3 is deliberately skipped
    (the original note on this reads "may is period3").
    """
    # (month of the cleaned file, peak day, number of the output period)
    peaks = [('2022-03', '2022-03-10', 1),
             ('2022-04', '2022-04-09', 2),
             ('2022-06', '2022-06-18', 4),
             ('2022-07', '2022-07-20', 5)]

    for month, day, period in peaks:
        df = pd.read_csv('cleaned/'+month+'-cleaned.csv', index_col=False, delimiter=",")

        start_peak = day+' 00:00:00+00:00'
        end_peak = day+' 23:59:59+00:00'

        # Inclusive bounds, so tweets stamped exactly at the first or last
        # second of the day are kept as well.
        # NOTE(review): this is a string comparison, so it assumes the column
        # holds timestamps formatted like the two bounds -- confirm.
        in_peak = (df['datetime'] >= start_peak) & (df['datetime'] <= end_peak)
        peak = df[in_peak]

        peak.to_csv('./monitoring/period'+str(period)+'.csv', index=False, sep=',')

    print("Peaks extracted ")

## - Extract labelled tweets

In [15]:
import pandas as pd

def extract_labeled(in_file, out_file):
    """Copy the rows of a CSV whose ``target`` is 0 or 1 into a new CSV.

    Parameters
    ----------
    in_file : str
        Path of the comma-separated input file; it must have a ``target`` column.
    out_file : str
        Path the labelled rows are written to (no index column).

    Returns
    -------
    tuple
        ``(rows, columns)`` shape of the labelled subset.
    """
    df = pd.read_csv(in_file, index_col=False, delimiter=",")
    print(df.shape)

    # pandas reads the column as text when it contains non-numeric markers,
    # but as numbers when it does not. Comparing on the numeric value makes
    # the filter work in both cases; anything non-numeric becomes NaN and is
    # dropped.
    target = pd.to_numeric(df['target'], errors='coerce')
    labeled = df[target.isin([0, 1])]
    labeled.to_csv(out_file, index=False, sep=',')

    return labeled.shape

# Concept drift

## - Training 

In [16]:
import time
import numpy as np
import pandas as pd
import pickle

from sklearn.naive_bayes import ComplementNB
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import chi2, SelectPercentile

def cd_training(path, data, i, c):
    """Fit the ComplementNB text pipeline on ``data`` and pickle it under ``path``.

    Parameters
    ----------
    path : str
        Directory the fitted model is written to.
    data : object exposing ``.text`` and ``.target``
        Training documents and their labels.
    i : int
        Window number, appended to the model file name.
    c : str
        ``'i'`` saves ``ComplementNB_interval<i>.sav``,
        ``'s'`` saves ``ComplementNB_slide<i>.sav``.

    Raises
    ------
    ValueError
        If ``c`` is neither ``'i'`` nor ``'s'``.
    """
    print("Training...")

    # Resolve the output name before fitting, so an unsupported mode fails
    # immediately instead of after the model has been trained.
    suffixes = {'i': '_interval', 's': '_slide'}
    if c not in suffixes:
        raise ValueError("c must be 'i' or 's', got %r" % (c,))

    tweets = data.text
    targets = data.target
    perc = 75  # percentage of features kept by the chi2 selection step
    model = {'name': 'ComplementNB', 'fun': ComplementNB()}

    # model building
    model['pipeline'] = Pipeline(steps=[('vect', CountVectorizer(ngram_range=(1, 1))),
                                ('tfidf', TfidfTransformer(smooth_idf=True, use_idf=True)),
                                ('fselect', SelectPercentile(chi2, percentile=perc)),
                                ('clf', model['fun'])])

    m = model['pipeline'].fit(tweets, targets)

    # Vocabulary size of the vectorizer step, i.e. counted before 'fselect'.
    print("Number of features: ", len(model['pipeline']['vect'].vocabulary_))

    # save model
    filename = model['name']+suffixes[c]+str(i)+'.sav'
    with open(path+'/'+filename, 'wb') as model_file:
        pickle.dump(m, model_file)

    print("\nModel correctly saved!\n")
    print('─' * 10)

## - Test

In [17]:
def cd_test(path, data, i, c, model_name='ComplementNB'):
    """Score a pickled model on a labelled test set and save the report as CSV.

    Parameters
    ----------
    path : str
        Directory prefix (used by plain string concatenation, so it must end
        with a separator). The report is written here; for ``'i'`` and ``'s'``
        the model is also loaded from here.
    data : mapping or DataFrame with ``'text'`` and ``'target'``
        Test documents and their labels.
    i : int
        Period / window number used in the model and report file names.
    c : str
        ``'st'`` loads ``models_result/75/models_75/<model_name>.sav`` and
        writes ``period<i>-report-...``; ``'i'`` uses ``<model_name>_interval<i>.sav``
        and ``interval<i>-report-...``; ``'s'`` uses ``<model_name>_slide<i>.sav``
        and ``slide<i>-report-...``.
    model_name : str, optional
        Base name of the model files. It is a parameter so the function no
        longer depends on a module-level ``model`` variable being defined.

    Raises
    ------
    ValueError
        If ``c`` is not one of ``'st'``, ``'i'``, ``'s'``.
    """
    print("Testing...")

    report_prefixes = {'st': 'period', 'i': 'interval', 's': 'slide'}
    if c not in report_prefixes:
        raise ValueError("c must be 'st', 'i' or 's', got %r" % (c,))

    tweets = data['text']
    targets = data['target']

    if c == 'st':
        model_file = 'models_result/75/models_75/'+model_name+'.sav'
    elif c == 'i':
        model_file = path+model_name+'_interval'+str(i)+'.sav'
    else:
        model_file = path+model_name+'_slide'+str(i)+'.sav'

    # NOTE(security): unpickling runs code stored in the file; only load
    # model files produced by this project.
    with open(model_file, 'rb') as model_handle:
        loaded_model = pickle.load(model_handle)

    print("Number of features: ", len(loaded_model['vect'].vocabulary_))

    score = loaded_model.score(tweets, targets)
    print("Test score: {0:.2f} %".format(100 * score))
    y_predict = loaded_model.predict(tweets)

    rep = classification_report(targets, y_predict,
                                target_names=['0', '1'])
    print(rep, '\n')

    # save reports (same report again, as a dict, so it can be tabulated)
    rep = classification_report(targets, y_predict,
                                target_names=['0', '1'], output_dict=True)
    df = pd.DataFrame(rep).transpose()

    df.to_csv(path+report_prefixes[c]+str(i)+'-report-'+model_name+'.csv')

## - Create window

In [35]:
def create_window(path, i, file1, file2, c, n_drop=None):
    """Build training window ``i`` from the previous window plus new tweets.

    Both CSV files are sorted by ``datetime`` (ascending). In incremental mode
    the new tweets are appended to the whole previous window; in sliding mode
    the first ``n_drop`` rows of the sorted previous window are discarded first.

    Parameters
    ----------
    path : str
        Directory prefix (must end with a separator) for the output CSV.
    i : int
        Window number, used in the output name and for the default drop size.
    file1 : str
        CSV of the previous window.
    file2 : str
        CSV of the new tweets.
    c : str
        ``'i'`` (incremental, writes ``interval<i>.csv``) or
        ``'s'`` (sliding, writes ``slide<i>.csv``).
    n_drop : int, optional
        Sliding mode only: number of leading rows to discard. Defaults to
        176 for window 1, 60 for window 6 and 80 otherwise.
        NOTE(review): these defaults presumably equal the size of the batch
        that leaves the window at each step -- confirm.

    Returns
    -------
    tuple
        ``(window_name, window)``: path of the CSV written and the DataFrame.

    Raises
    ------
    ValueError
        If ``c`` is not ``'i'`` or ``'s'``, or if ``n_drop`` is negative.
    """
    prefixes = {'i': 'interval', 's': 'slide'}
    if c not in prefixes:
        raise ValueError("c must be 'i' or 's', got %r" % (c,))

    print("Creating window...")
    previous = pd.read_csv(file1, index_col=False, delimiter=",")
    previous = previous.sort_values('datetime', ascending=True)
    print("Previous window: ", previous.shape)

    if c == 's':
        if n_drop is None:
            n_drop = {1: 176, 6: 60}.get(i, 80)
        if n_drop < 0:
            raise ValueError("n_drop must be >= 0, got %r" % (n_drop,))
        # iloc[n:] keeps everything after the first n rows and, unlike
        # tail(-n), also behaves correctly for n == 0.
        previous = previous.iloc[n_drop:]
        # The shape printed here is that of the rows that remain.
        print("Tweets deleted: ", previous.shape)

    new_tweets = pd.read_csv(file2, index_col=False, delimiter=",")
    new_tweets = new_tweets.sort_values('datetime', ascending=True)
    print("New tweets: ", new_tweets.shape)

    window = pd.concat([previous, new_tweets])
    print("New window: ", window.shape)

    window_name = path+prefixes[c]+str(i)+'.csv'
    window.to_csv(window_name, index=False)

    return window_name, window


## - Get files for building a new window

In [19]:
def get_files(path, i, c):
    """Return the CSV paths that are merged to build window ``i``.

    Window 1 always uses the two fixed files under ``monitoring/``. For any
    other window the result is the previous window file under ``path``
    (``interval<i-1>.csv`` when ``c == 'i'``, ``slide<i-1>.csv`` when
    ``c == 's'``, nothing for any other ``c``) followed by
    ``labeled/period<i-1>-labeled-only.csv``.
    """
    previous = i - 1

    if i == 1:
        return ['monitoring/2021-12-01-labeled-only.csv',
                'monitoring/2022-02-labeled-only.csv']

    # windows 2..6: previous window file (if the mode is known) + new labels
    window_files = []
    if c == 'i':
        window_files = [path+'interval'+str(previous)+'.csv']
    elif c == 's':
        window_files = [path+'slide'+str(previous)+'.csv']

    return window_files + ['labeled/period'+str(previous)+'-labeled-only.csv']

## - Static model

In [20]:
import pandas as pd
import string

def static_cd(model):
    """Evaluate the static model on labelled periods 1 to 5.

    For each period, loads ``./labeled/period<i>-labeled-only.csv``, passes it
    through ``preprocess`` and ``elaborate`` and hands the result to
    ``cd_test`` in ``'st'`` mode with the static output directory.

    ``model`` is accepted but not used inside this function.
    """
    out_dir = 'monitoring/concept_drift/static/'

    for period in range(1, 6):
        print("\n*********** STATIC MODEL ************\n")

        labelled_csv = './labeled/period' + str(period) + '-labeled-only.csv'
        period_df = pd.read_csv(labelled_csv, index_col=False, delimiter=",")

        # same two-stage preparation as before: preprocess, then elaborate
        prepared = elaborate(preprocess(period_df))

        cd_test(out_dir, prepared, period, 'st')

## - Sliding model

In [21]:
def sliding_cd(model):
    """Run the sliding-window experiment for windows 1 to 6.

    Each step builds the window with ``get_files`` / ``create_window`` in
    ``'s'`` mode, trains on it with ``cd_training`` and, for steps 1 to 5,
    evaluates with ``cd_test`` on ``./labeled/period<step>-labeled-only.csv``.

    ``model`` is accepted but not used inside this function.
    """
    out_dir = 'monitoring/concept_drift/sliding/'

    for step in range(1, 7):
        print("\n*********** SLIDING MODEL ************\n")

        # Build the window: previous window file + newly labelled tweets.
        sources = get_files(out_dir, step, 's')
        print("Files to merge: ", sources)
        previous_csv = sources[0]
        new_csv = sources[1]

        window_csv, window_df = create_window(out_dir, step, previous_csv, new_csv, 's')
        print("File created: ", window_csv, "\n")

        # Train on the freshly built window.
        train_df = elaborate(preprocess(window_df))
        cd_training(out_dir, train_df, step, 's')

        # The last window is only trained, never tested.
        if step >= 6:
            continue

        labelled_csv = './labeled/period' + str(step) + '-labeled-only.csv'
        eval_df = pd.read_csv(labelled_csv, index_col=False, delimiter=",")
        eval_df = elaborate(preprocess(eval_df))

        cd_test(out_dir, eval_df, step, 's')

## - Incremental model

In [22]:
def incremental_cd(model):
    """Run the incremental-window experiment for windows 1 to 6.

    Each step builds the window with ``get_files`` / ``create_window`` in
    ``'i'`` mode, trains on it with ``cd_training`` and, for steps 1 to 5,
    evaluates with ``cd_test`` on ``./labeled/period<step>-labeled-only.csv``.

    ``model`` is accepted but not used inside this function.
    """
    out_dir = 'monitoring/concept_drift/incremental/'

    for step in range(1, 7):
        print("\n*********** INCREMENTAL MODEL ************\n")

        # Build the window: previous window file + newly labelled tweets.
        sources = get_files(out_dir, step, 'i')
        print("Files to merge: ", sources)
        previous_csv = sources[0]
        new_csv = sources[1]

        window_csv, window_df = create_window(out_dir, step, previous_csv, new_csv, 'i')
        print("File created: ", window_csv, "\n")

        # Train on the freshly built window.
        train_df = elaborate(preprocess(window_df))
        cd_training(out_dir, train_df, step, 'i')

        # The last window is only trained, never tested.
        if step >= 6:
            continue

        labelled_csv = './labeled/period' + str(step) + '-labeled-only.csv'
        eval_df = pd.read_csv(labelled_csv, index_col=False, delimiter=",")
        eval_df = elaborate(preprocess(eval_df))

        cd_test(out_dir, eval_df, step, 'i')

## - Concept drift main

In [36]:
import time
import numpy as np
import pandas as pd
import pickle

from sklearn.naive_bayes import ComplementNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import chi2, SelectPercentile

if __name__ == '__main__':

    # Classifier descriptor handed to every experiment. It is bound at module
    # level, so it stays visible to the cells defined above.
    model = dict(name="ComplementNB", fun=ComplementNB())

    # Run the three experiments in order: static, incremental, sliding.
    for run_experiment in (static_cd, incremental_cd, sliding_cd):
        run_experiment(model)


*********** STATIC MODEL ************

Preprocessing done
Elaboration done


Testing...
Number of features:  4669
Test score: 73.75 %
              precision    recall  f1-score   support

           0       0.74      0.72      0.73        40
           1       0.73      0.75      0.74        40

    accuracy                           0.74        80
   macro avg       0.74      0.74      0.74        80
weighted avg       0.74      0.74      0.74        80
 


*********** STATIC MODEL ************

Preprocessing done
Elaboration done


Testing...
Number of features:  4669
Test score: 72.50 %
              precision    recall  f1-score   support

           0       0.70      0.78      0.74        40
           1       0.75      0.68      0.71        40

    accuracy                           0.73        80
   macro avg       0.73      0.73      0.72        80
weighted avg       0.73      0.72      0.72        80
 


*********** STATIC MODEL ************

Preprocessing done
Elaboration d

Elaboration done


Training...
Number of features:  4759

Model correctly saved!

──────────
Preprocessing done
Elaboration done


Testing...
Number of features:  4759
Test score: 71.25 %
              precision    recall  f1-score   support

           0       0.68      0.80      0.74        40
           1       0.76      0.62      0.68        40

    accuracy                           0.71        80
   macro avg       0.72      0.71      0.71        80
weighted avg       0.72      0.71      0.71        80
 


*********** SLIDING MODEL ************

Files to merge:  ['monitoring/concept_drift/sliding/slide3.csv', 'labeled/period3-labeled-only.csv']
Creating window...
Previous window:  (1576, 4)
Tweets deleted:  (1496, 4)
New tweets:  (80, 4)
New window:  (1576, 4)
File created:  monitoring/concept_drift/sliding/slide4.csv 

Preprocessing done
Elaboration done


Training...
Number of features:  4685

Model correctly saved!

──────────
Preprocessing done
Elaboration done


Testing...
N