Skip to content
This repository has been archived by the owner on Mar 1, 2018. It is now read-only.

Commit

Permalink
Merge remote-tracking branch 'origin/irio-hackatonny' into irio-hackatonny
Browse files Browse the repository at this point in the history
  • Loading branch information
cuducos committed Jan 2, 2017
2 parents 5eb9e0f + 871329a commit 325b5e8
Show file tree
Hide file tree
Showing 8 changed files with 157 additions and 10 deletions.
Empty file added lead-scoring/__init__.py
Empty file.
97 changes: 97 additions & 0 deletions lead-scoring/dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
from collections import OrderedDict
import math
import os.path

import pandas as pd
import numpy as np

# Root directory holding the Serenata de Amor datasets.  Overridable through
# the SERENATA_DATA_PATH environment variable; the default keeps the original
# hard-coded developer path for backward compatibility.
DATA_PATH = os.environ.get('SERENATA_DATA_PATH',
                           '/Users/irio/Desktop/serenata-data')

# Maps dataset column names to the Portuguese labels shown to end users.
# An OrderedDict on purpose: the insertion order also fixes the column
# order of the displayed ranking.
DISPLAY_KEYS = OrderedDict([
    ('issue_date', 'Data do gasto'),
    ('congressperson_name', 'Deputado'),
    ('total_net_value', 'Valor'),
    ('url', 'URL'),
    ('meal_price_outlier', 'Preço de refeição suspeito?'),
    ('over_monthly_subquota_limit', 'Acima da subcota?'),
    ('suspicious_traveled_speed_day', 'Distância viajada suspeita?'),
    ('has_receipt', 'Tem recibo?'),
    ('is_in_office', 'Em mandato?'),
    ('rosie_score', 'Nível de suspeita'),
    ('score', 'Ranking'),
    ('year', 'Ano'),
    ('document_id', 'ID'),
    ('applicant_id', 'ID Deputado'),
])


def full_path(path):
    """Return `path` resolved inside the local data directory."""
    return os.path.join(DATA_PATH, path)


def display(dataset):
    """Format the ranking dataset for end users.

    Returns a copy of `dataset` with human-friendly values (ISO date
    truncated to day, Jarbas URL, currency and percentage strings) and
    columns selected, ordered and renamed according to DISPLAY_KEYS.
    """
    data = dataset.copy()
    # Keep only the date part of the ISO timestamp (YYYY-MM-DD).
    data['issue_date'] = data['issue_date'].str[:10]
    data['url'] = data['document_id'] \
        .apply(lambda x: 'https://jarbas.datasciencebr.com/#/documentId/{}'.format(x))
    data['rosie_score'] = data['rosie_score'].apply(__display_percentage)
    data['score'] = data['score'].apply(__display_percentage)
    data['total_net_value'] = data['total_net_value'] \
        .apply(lambda x: 'R$ {0:.2f}'.format(x))
    # `list(DISPLAY_KEYS)` replaces the redundant `[k for k in ...keys()]`.
    data = data[list(DISPLAY_KEYS)]
    data.rename(columns=DISPLAY_KEYS, inplace=True)
    return data


def __display_percentage(values):
    """Format a 0-1 ratio as a percentage string, e.g. 0.5 -> '50.00%'."""
    return '{0:.2f}%'.format(values * 100)

def ranking():
    """Assemble the suspicion ranking, most relevant receipts first.

    Loads the flagged irregularities, enriches them with office status,
    receipt availability and the blended score, sorts by those three
    criteria (all descending) and returns the display-formatted result.
    """
    dataset = __irregularities()
    dataset = pd.merge(dataset, __is_in_office(dataset))
    # NOTE(review): the 2011 cutoff presumably marks when digitized
    # receipts became available — confirm.
    dataset['has_receipt'] = dataset['year'] > 2011
    dataset['score'] = __score(dataset)
    criteria = ['is_in_office', 'has_receipt', 'score']
    dataset = dataset.sort_values(criteria, ascending=[False] * len(criteria))
    # Deduplicates in place; the returned value is the same frame.
    remove_receipts_from_same_case(dataset)
    return display(dataset)

def remove_receipts_from_same_case(data):
    """Drop receipts that belong to an already-listed suspicion case.

    Two deduplication passes: one per (applicant, day, speed-suspicion)
    and one per (applicant, month, subquota-suspicion).  Mutates `data`
    in place and also returns it for convenience.
    """
    case_keys = (
        ['applicant_id', 'issue_date', 'suspicious_traveled_speed_day'],
        ['applicant_id', 'month', 'over_monthly_subquota_limit'],
    )
    for keys in case_keys:
        data.drop_duplicates(keys, inplace=True)
    return data

def __is_in_office(data):
    """Flag applicants whose most recent expense year is 2015 or later.

    Returns a two-column frame (`applicant_id`, `is_in_office`) with one
    row per applicant.
    """
    latest = data.groupby('applicant_id') \
        .apply(lambda group: group['year'].max() >= 2015)
    return latest.reset_index().rename(columns={0: 'is_in_office'})


def __score(data):
    """Final ranking score: equal parts Rosie suspicion and expense size.

    Side effect: writes the blended `rosie_score` column back into
    `data`, since the display code shows it later.
    """
    data['rosie_score'] = __rosie_score(data)
    # Log scale dampens very large reimbursements; dividing by the log of
    # the maximum keeps the component roughly normalized to the dataset.
    ceiling = math.log(data['total_net_value'].max())
    net_value_score = data['total_net_value'].apply(math.log) / ceiling
    return .5 * data['rosie_score'] + .5 * net_value_score


def __rosie_score(data):
    """Weighted mix of the per-classifier suspicion probabilities."""
    weights = (('meal_price_outlier_probability', .5),
               ('suspicious_traveled_speed_day_probability', .3),
               ('over_monthly_subquota_limit_probability', .2))
    return sum(weight * data[column] for column, weight in weights)


def __irregularities():
    """Load receipts flagged by Rosie, joined with reimbursement records.

    Keeps only rows where at least one boolean suspicion column is True,
    then merges them with the 2016-12-06 reimbursements dump, restricted
    to reimbursements that have a congressperson attached.
    """
    data = pd.read_csv(full_path('irregularities.xz'),
                       low_memory=False)
    # The `np.bool` alias was deprecated in NumPy 1.20 and removed in
    # 1.24; the builtin `bool` selects the same boolean dtype columns.
    is_valid_suspicion = data.select_dtypes(include=[bool]).any(axis=1)
    data = data[is_valid_suspicion]
    reimbursements = pd.read_csv(full_path('2016-12-06-reimbursements.xz'),
                                 low_memory=False)
    reimbursements = reimbursements.query('congressperson_id.notnull()')
    return pd.merge(data, reimbursements)
13 changes: 13 additions & 0 deletions lead-scoring/index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from flask import Flask

from dataset import full_path, ranking

# Build the ranking and persist it as CSV at import time, so the file is
# ready before the web app serves anything.  NOTE(review): this runs on
# *every* import of the module (e.g. under a WSGI server) and may be slow
# — confirm that is intended.
ranking().to_csv(full_path('ranking.csv'), index=False)
app = Flask(__name__)

@app.route('/')
def hello():
    # Placeholder endpoint; the ranking itself is only written to disk.
    return 'Hello World!'

if __name__ == '__main__':
    # Flask development server when executed directly.
    app.run()
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
git+https://github.com/datasciencebr/serenata-toolbox.git#egg=serenata-toolbox
Flask==0.11.1
geopy>=1.11.0
pymongo==3.4.0
scikit-learn>=0.17
scipy>=0.18
4 changes: 4 additions & 0 deletions rosie/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os.path

import numpy as np
import pandas as pd
from sklearn.externals import joblib

from rosie.dataset import Dataset
Expand Down Expand Up @@ -52,7 +53,10 @@ def load_trained_model(self, classifier):
def predict(self, model, irregularity):
model.transform(self.dataset)
y = model.predict(self.dataset)
probabilities = model.predict_proba(self.dataset)
self.irregularities[irregularity] = y
self.irregularities[irregularity + '_probability'] = \
pd.Series(probabilities).fillna(.0)
if y.dtype == np.int:
self.irregularities.loc[y == 1, irregularity] = False
self.irregularities.loc[y == -1, irregularity] = True
Expand Down
30 changes: 24 additions & 6 deletions rosie/meal_price_outlier_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.base import TransformerMixin
from sklearn.cluster import KMeans

Expand Down Expand Up @@ -32,16 +33,28 @@ def transform(self, X=None):
pass

def predict(self, X):
    """Classify each row of `X`: 1 for regular, -1 for outlier (the `y`
    column produced by `__predict`)."""
    predictions = self.__predict(X)
    return predictions['y']

def predict_proba(self, X):
    """Per-row outlier probability from `__predict`, as a NumPy array
    (NaN where the classifier does not apply)."""
    probabilities = self.__predict(X)['probability']
    return np.r_[probabilities]

def __predict(self, X):
_X = X.copy()
companies = _X[self.__applicable_rows(_X)] \
.groupby('cnpj_cpf').apply(self.__company_stats) \
.reset_index()
companies['cluster'] = \
self.cluster_model.predict(companies[self.CLUSTER_KEYS])
companies = pd.merge(companies,
self.clusters[['cluster', 'threshold']],
how='left')
_X = pd.merge(_X, companies[['cnpj_cpf', 'threshold']], how='left')
self.clusters,
how='left',
on='cluster',
suffixes=['', '_cluster'])
_X = pd.merge(_X, companies, how='left', on='cnpj_cpf')
rows = self.__applicable_rows(_X)
_X['probability'] = float('nan')
_X.loc[rows, 'probability'] = \
self.__probability(_X[rows], 4, col_suffix='_cluster')
known_companies = companies[self.__applicable_company_rows(companies)]
known_thresholds = known_companies \
.groupby('cnpj_cpf') \
Expand All @@ -50,14 +63,19 @@ def predict(self, X):
.rename(columns={0: 'cnpj_threshold'})
_X = pd.merge(_X, known_thresholds, how='left')
if 'cnpj_threshold' in _X.columns:
_X.loc[_X['cnpj_threshold'].notnull(),
'threshold'] = _X['cnpj_threshold']
rows = rows & _X['cnpj_threshold'].notnull()
_X.loc[rows, 'threshold'] = _X['cnpj_threshold']
_X.loc[rows, 'probability'] = self.__probability(_X[rows])
_X['y'] = 1
is_outlier = self.__applicable_rows(_X) & \
_X['threshold'].notnull() & \
(_X['total_net_value'] > _X['threshold'])
_X.loc[is_outlier, 'y'] = -1
return _X['y']
return _X

def __probability(self, X, stds_threshold=3, col_suffix=''):
    # Suspicion score built from the per-company (or per-cluster, with
    # col_suffix='_cluster') price mean/std columns of `X`.
    # NOTE(review): this uses the normal *density* (pdf), not a tail
    # probability (`sf` / 1-cdf).  For small `std` the pdf can exceed 2,
    # pushing the result outside [0, 1] — confirm whether
    # `stats.norm.sf` was intended.
    mean_col, std_col = 'mean' + col_suffix, 'std' + col_suffix
    return 1 - stats.norm.pdf(stds_threshold, X[mean_col], X[std_col]) / 2

def __applicable_rows(self, X):
return (X['subquota_description'] == 'Congressperson meal') & \
Expand Down
2 changes: 1 addition & 1 deletion rosie/monthly_subquota_limit_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def predict(self, X=None):


def predict_proba(self, X=None):
    """Return a constant probability of 1.0 for every row of `X`."""
    ones = np.repeat(1., len(X))
    return ones


def __create_columns(self):
Expand Down
17 changes: 15 additions & 2 deletions rosie/traveled_speeds_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,12 @@ def transform(self, X=None):
pass

def predict(self, X):
    """Classify each row of `X`: 1 for regular, -1 for outlier (the `y`
    column produced by `__predict`)."""
    labeled = self.__predict(X)
    return labeled['y']

def predict_proba(self, X):
    """Per-row suspicion probability from `__predict`, as a NumPy array."""
    scored = self.__predict(X)
    return np.r_[scored['probability']]

def __predict(self, X):
check_is_fitted(self, ['polynomial', '_polynomial_fn'])

_X = X.copy()
Expand All @@ -37,8 +43,8 @@ def predict(self, X):
_X = pd.merge(X, _X, how='left', left_on=self.AGG_KEYS, right_on=self.AGG_KEYS)
is_outlier = self.__applicable_rows(_X) & \
(_X['expenses_threshold_outlier'] | _X['traveled_speed_outlier'])
y = is_outlier.astype(np.int).replace({1: -1, 0: 1})
return y
_X['y'] = is_outlier.astype(np.int).replace({1: -1, 0: 1})
return _X

def __aggregate_dataset(self, X):
X = X[self.__applicable_rows(X)]
Expand All @@ -59,6 +65,13 @@ def __classify_dataset(self, X):
_X['expenses_threshold_outlier'] = _X['expenses'] > 8
threshold = self.__threshold_for_contamination(_X, self.contamination)
_X['traveled_speed_outlier'] = _X['diff_distance'] > threshold
_X['probability'] = \
(_X['distance_traveled'] - _X['expected_distance']) / \
_X['diff_distance'].max()
_X['probability'] = \
np.r_[[np.repeat(0., len(_X)),
_X['probability']]].max(axis=0)
_X.loc[_X['expenses'] > 8, 'probability'] = 1.
return _X

def __applicable_rows(self, X):
Expand Down

0 comments on commit 325b5e8

Please sign in to comment.