In [86]:
%run Preprocessing.ipynb

# Online monitoring

## - Extract peaks

In [97]:
import pandas as pd
import datetime
import time

# Peak days extracted by extract_peaks() (one per monthly cleaned file):
#   2022-03-10
#   2022-04-09
#   2022-06-18

def _extract_peak(in_file, out_file, day):
    """Copy the rows of ``in_file`` that fall inside ``day`` to ``out_file``.

    Parameters
    ----------
    in_file : str
        CSV file with a ``datetime`` column.
    out_file : str
        Destination CSV (written without the index).
    day : str
        Day to keep, formatted ``YYYY-MM-DD``.
    """
    df = pd.read_csv(in_file, index_col=False, delimiter=",")

    start_peak = day + ' 00:00:00+00:00'
    end_peak = day + ' 23:59:59+00:00'

    # The column is read without date parsing, so it is compared against the
    # bounds as read from the CSV. Both bounds are strict: a row stamped exactly
    # 00:00:00 or 23:59:59 is left out.
    peak = df[(df['datetime'] > start_peak) & (df['datetime'] < end_peak)]

    peak.to_csv(out_file, index=False, sep=',')


def extract_peaks():
    """Extract the tweets of each peak day into its own monitoring file.

    Reads the monthly cleaned CSVs under ``./cleaned/`` and writes one file per
    peak day under ``./monitoring/``. Returns nothing; prints a confirmation
    once every file has been written.
    """
    # (monthly input, output, peak day). The outputs are period1, period2 and
    # period4 -- no period3 file is produced here.
    peaks = [
        ('./cleaned/2022-03-cleaned.csv', './monitoring/period1.csv', '2022-03-10'),
        ('./cleaned/2022-04-cleaned.csv', './monitoring/period2.csv', '2022-04-09'),
        ('./cleaned/2022-06-cleaned.csv', './monitoring/period4.csv', '2022-06-18'),
    ]

    for in_file, out_file, day in peaks:
        _extract_peak(in_file, out_file, day)

    print("Peaks extracted ")

## - Extract labelled tweets

In [98]:
import pandas as pd

def extract_labeled(in_file, out_file):
    """Keep only the rows whose ``target`` is 0 or 1 and write them to ``out_file``.

    Parameters
    ----------
    in_file : str
        CSV file with a ``target`` column.
    out_file : str
        Destination CSV (written without the index).

    Returns
    -------
    tuple
        ``(rows, columns)`` of the labelled subset that was written.
    """
    df = pd.read_csv(in_file, index_col=False, delimiter=",")
    print(df.shape)

    # read_csv infers the dtype of `target`. When every non-empty value is a
    # number the column is numeric, and comparing it to the strings '0'/'1'
    # matches nothing. Comparing numerically handles both the text and the
    # numeric case; anything that is not a number becomes NaN and is dropped.
    target = pd.to_numeric(df['target'], errors='coerce')
    labeled = df[target.isin([0, 1])]
    labeled.to_csv(out_file, index=False, sep=',')

    return labeled.shape

# Concept drift

## - Training 

In [99]:
import time
import numpy as np
import pandas as pd
import pickle

from sklearn.naive_bayes import ComplementNB
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import chi2, SelectPercentile

def cd_training(path, data, i, c):
    """Fit the ComplementNB text pipeline on ``data`` and pickle it under ``path``.

    Parameters
    ----------
    path : str
        Directory the model file is written to.
    data : DataFrame-like
        Must expose ``text`` (documents) and ``target`` (labels) attributes.
    i : int
        Window index, used in the model file name.
    c : str
        ``'i'`` saves ``ComplementNB_interval<i>.sav``;
        ``'s'`` saves ``ComplementNB_slide<i>.sav``.

    Raises
    ------
    ValueError
        If ``c`` is neither ``'i'`` nor ``'s'``.
    """
    # Resolve the file-name suffix first so an unsupported mode fails before
    # the (potentially long) fit instead of after it.
    if c == 'i':
        suffix = '_interval'
    elif c == 's':
        suffix = '_slide'
    else:
        raise ValueError("c must be 'i' or 's', got {!r}".format(c))

    print("Training...")

    tweets = data.text
    targets = data.target

    model = {'name': 'ComplementNB', 'fun': ComplementNB()}

    # model building: bag of words -> tf-idf -> chi2 feature selection -> classifier
    model['pipeline'] = Pipeline(steps=[('vect', CountVectorizer(ngram_range=(1, 1))),
                                ('tfidf', TfidfTransformer(smooth_idf=True, use_idf=True)),
                                ('fselect', SelectPercentile(chi2, percentile=85)),
                                ('clf', model['fun'])])

    m = model['pipeline'].fit(tweets, targets)

    # This is the vectorizer's vocabulary size, i.e. the count before the
    # 'fselect' step is applied.
    print("Number of features: ", len(model['pipeline']['vect'].vocabulary_))

    # save model
    filename = model['name'] + suffix + str(i) + '.sav'

    # Context manager so the file is flushed and closed even if pickling fails.
    with open(path+'/'+filename, 'wb') as model_file:
        pickle.dump(m, model_file)

    print("\nModel correctly saved!\n")
    print('─' * 10)

## - Test

In [108]:
def cd_test(path, data, i, c, model_name=None):
    """Score a saved model on ``data`` and write its classification report.

    Parameters
    ----------
    path : str
        Directory prefix (expected to end with a separator) holding the
        interval/slide models and receiving the report CSV.
    data : DataFrame-like
        Must provide ``data['text']`` and ``data['target']``.
    i : int
        Window/period index, used in the model and report file names.
    c : str
        ``'st'`` loads the static model from ``models_result/85/models_85/``,
        ``'i'`` loads ``<path><name>_interval<i>.sav``,
        ``'s'`` loads ``<path><name>_slide<i>.sav``.
    model_name : str, optional
        Base name of the pickled model. When omitted, falls back to
        ``model['name']`` from the notebook-level ``model`` variable, which is
        what this function has always relied on.

    Raises
    ------
    ValueError
        If ``c`` is not one of ``'st'``, ``'i'``, ``'s'``.
    """
    if c not in ('st', 'i', 's'):
        raise ValueError("c must be 'st', 'i' or 's', got {!r}".format(c))

    # No visible cell imports classification_report; importing it here keeps
    # the function usable regardless of what the %run'd notebook provides.
    from sklearn.metrics import classification_report

    print("Testing...")

    tweets = data['text']
    targets = data['target']

    if model_name is None:
        # Backward-compatible fallback to the global defined by the main cell.
        model_name = model['name']

    model_files = {
        'st': 'models_result/85/models_85/' + model_name + '.sav',
        'i': path + model_name + '_interval' + str(i) + '.sav',
        's': path + model_name + '_slide' + str(i) + '.sav',
    }

    # NOTE: pickle.load executes arbitrary code from the file -- only load
    # model files produced by this project.
    with open(model_files[c], 'rb') as model_file:
        loaded_model = pickle.load(model_file)

    score = loaded_model.score(tweets, targets)
    print("Test score: {0:.2f} %".format(100 * score))
    y_predict = loaded_model.predict(tweets)

    rep = classification_report(targets, y_predict,
                                target_names=['0', '1'])
    print(rep, '\n')

    # save reports
    rep = classification_report(targets, y_predict,
                                target_names=['0', '1'], output_dict=True)
    df = pd.DataFrame(rep).transpose()

    report_prefixes = {'st': 'period', 'i': 'interval', 's': 'slide'}
    df.to_csv(path + report_prefixes[c] + str(i) + '-report.csv')

## - Create window

In [101]:
def create_window(path, i, file1, file2, c, first_drop=176, next_drop=80):
    """Build the next training window from the previous window plus new tweets.

    Both inputs are sorted by ``datetime`` (ascending). In incremental mode the
    new tweets are appended to the whole previous window; in sliding mode the
    oldest rows of the previous window are dropped first.

    Parameters
    ----------
    path : str
        Directory prefix (expected to end with a separator) for the output CSV.
    i : int
        Window index, used in the output file name and to pick the drop size.
    file1 : str
        CSV of the previous window.
    file2 : str
        CSV of the new tweets.
    c : str
        ``'i'`` (incremental, writes ``interval<i>.csv``) or
        ``'s'`` (sliding, writes ``slide<i>.csv``).
    first_drop : int, optional
        Oldest rows removed in sliding mode when ``i == 1``.
    next_drop : int, optional
        Oldest rows removed in sliding mode for every later window.

    Returns
    -------
    tuple
        ``(window_name, window)``: the path written and the window DataFrame.

    Raises
    ------
    ValueError
        If ``c`` is not ``'i'``/``'s'`` or a drop size is negative.
    """
    if c not in ('i', 's'):
        raise ValueError("c must be 'i' or 's', got {!r}".format(c))
    if first_drop < 0 or next_drop < 0:
        raise ValueError("first_drop and next_drop must be non-negative")

    print("Creating window...")
    previous = pd.read_csv(file1, index_col=False, delimiter=",")
    previous = previous.sort_values('datetime', ascending=True)
    print("Previous window: ", previous.shape)

    if c == 's':
        n_drop = first_drop if i == 1 else next_drop
        # iloc[n:] drops the n oldest rows and, unlike tail(-n), keeps
        # everything when n is 0.
        kept = previous.iloc[n_drop:]
        # Report how many rows were actually removed.
        print("Tweets deleted: ", len(previous) - len(kept))
    else:
        kept = previous

    new_tweets = pd.read_csv(file2, index_col=False, delimiter=",")
    new_tweets = new_tweets.sort_values('datetime', ascending=True)
    print("New tweets: ", new_tweets.shape)

    window = pd.concat([kept, new_tweets])
    print("New window: ", window.shape)

    prefix = 'interval' if c == 'i' else 'slide'
    window_name = path + prefix + str(i) + '.csv'
    window.to_csv(window_name, index=False)

    return window_name, window


## - Get files for building a new window

In [102]:
def get_files(path, i, c):
    """Return the two CSV paths to merge into window ``i``.

    Parameters
    ----------
    path : str
        Directory prefix (expected to end with a separator) of the windows.
    i : int
        Index of the window being built.
    c : str
        ``'i'`` (incremental) or ``'s'`` (sliding). Only consulted for ``i > 1``.

    Returns
    -------
    list of str
        ``[previous data, new data]``. For ``i == 1`` these are the two fixed
        seed files; otherwise the previous window file and the labelled tweets
        of period ``i - 1``.

    Raises
    ------
    ValueError
        If ``i != 1`` and ``c`` is neither ``'i'`` nor ``'s'``.
    """
    if i == 1:
        return ['monitoring/12-01-rebalanced-only-labeled.csv',
                'monitoring/2022-02-labeled-only.csv']

    if c == 'i':
        prefix = 'interval'
    elif c == 's':
        prefix = 'slide'
    else:
        # Previously this returned a single path, which only surfaced later
        # as an IndexError in the caller.
        raise ValueError("c must be 'i' or 's', got {!r}".format(c))

    previous = i - 1
    return [path + prefix + str(previous) + '.csv',
            'monitoring/period-' + str(previous) + '-labeled-only.csv']

## - Static model

In [103]:
import pandas as pd
import string

def static_cd(model):
    """Evaluate the static model on the labelled tweets of periods 1 to 4.

    Each period file is read, passed through ``preprocess`` and ``elaborate``,
    and handed to ``cd_test`` in ``'st'`` mode, which writes its report under
    ``monitoring/concept_drift/static/``.

    ``model`` is accepted for symmetry with the other strategies; it is not
    referenced in this function's body.
    """
    static_path = 'monitoring/concept_drift/static/'

    for period in range(1, 5):
        period_file = f'./monitoring/period-{period}-labeled-only.csv'
        period_tweets = pd.read_csv(period_file, index_col=False, delimiter=",")

        # same two-step preparation as the training data
        prepared = elaborate(preprocess(period_tweets))

        cd_test(static_path, prepared, period, 'st')

## - Sliding model

In [104]:
def sliding_cd(model):
    """Run the sliding-window strategy for windows 1 to 5.

    For every window: pick the files to merge, build the window in ``'s'``
    mode, retrain on it, and -- for all but the last window -- test the fresh
    model on the labelled tweets of the period with the same index.

    ``model`` is not referenced in this function's body.
    """
    sliding_path = 'monitoring/concept_drift/sliding/'
    last_window = 5

    for step in range(1, last_window + 1):

        print("\n*********** SLIDING MODEL ************\n")

        # create window: previous window (or seed data) + newly labelled tweets
        sources = get_files(sliding_path, step, 's')
        print("Files to merge: ", sources)
        previous_file, new_file = sources[0], sources[1]

        slide_name, slide = create_window(sliding_path, step, previous_file, new_file, 's')
        print("File created: ", slide_name, "\n")

        # train
        cd_training(sliding_path, elaborate(preprocess(slide)), step, 's')

        # test on next month -- the last window has no later period to test on
        if step >= last_window:
            continue

        period_file = f'./labeled/period-{step}-labeled-only.csv'
        period_tweets = pd.read_csv(period_file, index_col=False, delimiter=",")

        cd_test(sliding_path, elaborate(preprocess(period_tweets)), step, 's')

## - Incremental model

In [109]:
def incremental_cd(model):
    """Run the incremental (growing-window) strategy for windows 1 to 5.

    For every window: pick the files to merge, build the window in ``'i'``
    mode, retrain on it, and -- for all but the last window -- test the fresh
    model on the labelled tweets of the period with the same index.

    ``model`` is not referenced in this function's body.
    """
    incremental_path = 'monitoring/concept_drift/incremental/'
    last_window = 5

    for step in range(1, last_window + 1):

        print("\n*********** INCREMENTAL MODEL ************\n")

        # create window: previous window (or seed data) + newly labelled tweets
        sources = get_files(incremental_path, step, 'i')
        print("Files to merge: ", sources)
        previous_file, new_file = sources[0], sources[1]

        interval_name, interval = create_window(incremental_path, step, previous_file, new_file, 'i')
        print("File created: ", interval_name, "\n")

        # train
        cd_training(incremental_path, elaborate(preprocess(interval)), step, 'i')

        # test on next month -- the last window has no later period to test on
        if step >= last_window:
            continue

        period_file = f'./labeled/period-{step}-labeled-only.csv'
        period_tweets = pd.read_csv(period_file, index_col=False, delimiter=",")

        cd_test(incremental_path, elaborate(preprocess(period_tweets)), step, 'i')

## - Concept drift main

In [1]:
import time
import numpy as np
import pandas as pd
import pickle

from sklearn.naive_bayes import ComplementNB
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import chi2, SelectPercentile

if __name__ == '__main__':

    # Keep this as a notebook-level variable named `model`: cd_test looks up
    # `model['name']` as a global when building the model file names.
    model = dict(name="ComplementNB", fun=ComplementNB())

    # Strategies run one after the other: static baseline first, then the
    # incremental and sliding retraining schemes.
    static_cd(model)
    incremental_cd(model)
    sliding_cd(model)

NameError: name 'ComplementNB' is not defined