In [1]:
# dataframe management
import pandas as pd             


# numerical computation
import numpy as np

from sklearn import preprocessing

# visualization library
import seaborn as sns
sns.set(style="white", color_codes=True)
sns.set_context(rc={"font.family":'sans',"font.size":24,"axes.titlesize":24,"axes.labelsize":24})   


# import matplotlib and allow it to plot inline
import matplotlib.pyplot as plt
%matplotlib inline

# seaborn can generate several warnings, we ignore them
import warnings 
warnings.filterwarnings("ignore")

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

#in order to prin all the columns
pd.set_option('display.max_columns', 100)

In [3]:
#reading the datasets
bonifici = pd.read_csv("datasets/quiubi_bonifici.csv")
segnalaz = pd.read_csv("datasets/bonifici_segnalaz.csv")
bonifici.set_index('indice',inplace=True)
segnalaz.set_index('indice',inplace=True)

In [4]:
# dropping columns with useless data
bonifici = bonifici.drop(["CAP", "Servizio", "Status", "Paese", "Provincia", "Nazione", "IDTransazione", "CRO", "Causale", "Valuta", "ProfSicurezza", "NumConto", "ABI", "CAB", "Intestatario", "Indirizzo"], axis=1)
bonifici = bonifici.drop(["MsgErrore", "Nominativo", "TipoOperazione"], axis=1)
segnalaz = segnalaz.drop(["CAP", "Servizio", "Status", "Paese", "Provincia", "Nazione", "IDTransazione", "CRO", "Causale", "Valuta", "ProfSicurezza", "NumConto", "ABI", "CAB", "Intestatario", "Indirizzo"], axis=1)
segnalaz = segnalaz.drop(["MsgErrore", "Nominativo", "TipoOperazione"], axis=1)

In [5]:
# c'è un indice duplicato -> eliminalo
# NB: NON E' IL MODO MIGLIORE (GLI INDICI SONO DUPLICATI, MA LE TRANSAZIONI DIVERSE)
bonifici = bonifici[~bonifici.index.duplicated()]

In [6]:
# datasets merge into bonifici
bonifici["isFraud"] = np.zeros(len(bonifici.index))
for index, row in segnalaz.iterrows():
    if index in bonifici.index:
        bonifici.loc[index, "isFraud"] = 1
    else:
        # print(index)
        bonifici.append(row)

bonifici["isFraud"] = pd.to_numeric(bonifici["isFraud"], downcast='integer')

# Min Preprocessing

In [7]:
bonifici.Timestamp = pd.to_datetime(bonifici.Timestamp)
bonifici.NumConfermaSMS = bonifici.NumConfermaSMS.eq('Si').astype(int)

In [8]:
bonifici["Importo"].to_numpy().reshape(-1, 1)

array([[1525. ],
       [4653.9],
       [ 600. ],
       ...,
       [ 500. ],
       [ 279.9],
       [ 488. ]])

In [458]:
x = bonifici[["Importo"]].to_numpy()
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
bonifici[["Importo"]] = x_scaled

In [459]:
# Labels are the values we want to predict
labels = np.array(bonifici['isFraud'])
# Remove the labels from the features
bonifici = bonifici[["Importo", "NumConfermaSMS"]]
# Saving feature names for later use
feature_list = ["Importo", "NumConfermaSMS"]
# Convert to numpy array
features = np.array(bonifici)

In [460]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)

In [461]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (362181, 2)
Training Labels Shape: (362181,)
Testing Features Shape: (120727, 2)
Testing Labels Shape: (120727,)


In [462]:
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data
rf.fit(train_features, train_labels);

In [463]:
rf.predict(test_features)

array([0.        , 0.00016463, 0.        , ..., 0.        , 0.00016463,
       0.        ])

In [9]:
# LSTM for international airline passengers problem with time step regression framing
import numpy
import matplotlib.pyplot as plt
from pandas import read_csv
import math
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error


# convert an array of values into a dataset matrix
def create_dataset(dataset, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset)-look_back-1):
        a = dataset[i:(i+look_back), 0]
        dataX.append(a)
        dataY.append(dataset[i + look_back, 0])
    print(dataX)
    print("---------------")
    print(dataY)
    return numpy.array(dataX), numpy.array(dataY)


# fix random seed for reproducibility
numpy.random.seed(7)
# load the dataset
dataframe = read_csv('datasets/airline-passengers.csv', usecols=[1], engine='python')
dataset = dataframe.values
dataset = dataset.astype('float32')
# normalize the dataset
# split into train and test sets
train_size = int(len(dataset) * 0.67)
test_size = len(dataset) - train_size
train, test = dataset[0:train_size,:], dataset[train_size:len(dataset),:]
# reshape into X=t and Y=t+1
look_back = 2
trainX, trainY = create_dataset(train, look_back)
testX, testY = create_dataset(test, look_back)
# reshape input to be [samples, time steps, features]
trainX = numpy.reshape(trainX, (trainX.shape[0], trainX.shape[1], 1))
testX = numpy.reshape(testX, (testX.shape[0], testX.shape[1], 1))

[array([112., 118.], dtype=float32), array([118., 132.], dtype=float32), array([132., 129.], dtype=float32), array([129., 121.], dtype=float32), array([121., 135.], dtype=float32), array([135., 148.], dtype=float32), array([148., 148.], dtype=float32), array([148., 136.], dtype=float32), array([136., 119.], dtype=float32), array([119., 104.], dtype=float32), array([104., 118.], dtype=float32), array([118., 115.], dtype=float32), array([115., 126.], dtype=float32), array([126., 141.], dtype=float32), array([141., 135.], dtype=float32), array([135., 125.], dtype=float32), array([125., 149.], dtype=float32), array([149., 170.], dtype=float32), array([170., 170.], dtype=float32), array([170., 158.], dtype=float32), array([158., 133.], dtype=float32), array([133., 114.], dtype=float32), array([114., 140.], dtype=float32), array([140., 145.], dtype=float32), array([145., 150.], dtype=float32), array([150., 178.], dtype=float32), array([178., 163.], dtype=float32), array([163., 172.], dtype=f