In [1]:
import pdfquery as pq
import pandas as pd
import requests
import os

Scrape PDFs

In [2]:
def scrapePdf(pdfPath):
    pdf = pq.PDFQuery(pdfPath)
    pdf.load()
    pdfData = pd.DataFrame({
        "countryName": pdf.pq('LTTextLineHorizontal:overlaps_bbox("138.98, 686.86, 211.513, 697.9")').text(),
        "EPRETRSectorCode": pdf.pq('LTTextLineHorizontal:overlaps_bbox("180.26, 643.3, 185.857, 654.34")').text(),
        "eptrSectorName":
            pdf.pq('LTTextLineHorizontal:overlaps_bbox("210.29, 643.3, 471.936, 654.34")').text().split(" ", 1)[1],
        "EPRTRAnnexIMainActivityCode":
            pdf.pq('LTTextLineHorizontal:overlaps_bbox("52.8, 614.26, 157.097, 625.3")').text().split(" ", 1)[1],
        "FacilityInspireID":
            pdf.pq('LTTextLineHorizontal:overlaps_bbox("52.8, 715.9, 266.172, 726.94")').text().split(" ", 1)[1],
        "facilityName":
            pdf.pq('LTTextLineHorizontal:overlaps_bbox("52.8, 730.42, 343.076, 741.46")').text().split(" ", 2)[2],
        "City": pdf.pq('LTTextLineHorizontal:overlaps_bbox("138.98, 672.34, 221.195, 683.38")').text(),
        "CITY ID": pdf.pq('LTTextLineHorizontal:overlaps_bbox("138.98, 175.7, 312.518, 186.74")').text(),
        "targetRelease": pdf.pq('LTTextLineHorizontal:overlaps_bbox("138.98, 570.67, 154.094, 581.71")').text(),
        "pollutant": pdf.pq('LTTextLineHorizontal:overlaps_bbox("306.17, 570.67, 406.325, 581.71")').text(),
        "DAY": pdf.pq('LTTextLineHorizontal:overlaps_bbox("174.62, 527.11, 185.857, 538.15")').text(),
        "MONTH": pdf.pq('LTTextLineHorizontal:overlaps_bbox("347.47, 527.11, 384.086, 538.15")').text().split(" ", 1)[
            0],
        "reportingYear": pdf.pq('LTTextLineHorizontal:overlaps_bbox("461.38, 527.11, 483.897, 538.15")').text(),
        "CONTINENT": pdf.pq('LTTextLineHorizontal:overlaps_bbox("437.02, 686.86, 473.96, 697.9")').text(),
        "max_wind_speed":
            pdf.pq('LTTextLineHorizontal:overlaps_bbox("52.8, 452.11, 185.806, 463.15")').text().split(" ", 1)[1],
        "avg_wind_speed":
            pdf.pq('LTTextLineHorizontal:overlaps_bbox("316.87, 452.11, 483.846, 463.15")').text().split(" ", 2)[2],
        "min_wind_speed":
            pdf.pq('LTTextLineHorizontal:overlaps_bbox("316.87, 452.11, 483.846, 463.15")').text().split(" ", 1)[0],
        "max_temp": pdf.pq('LTTextLineHorizontal:overlaps_bbox("144.02, 388.01, 185.806, 399.05")').text(),
        "avg_temp": pdf.pq('LTTextLineHorizontal:overlaps_bbox("442.06, 388.01, 483.846, 399.05")').text(),
        "min_temp":
            pdf.pq('LTTextLineHorizontal:overlaps_bbox("311.23, 388.01, 405.714, 399.05")').text().split(" ", 1)[0],
        "DAYS WITH FOG": pdf.pq('LTTextLineHorizontal:overlaps_bbox("174.62, 323.93, 185.857, 334.97")').text(),
        "REPORTER NAME":
            pdf.pq('LTTextLineHorizontal:overlaps_bbox("356.95, 233.81, 504.706, 244.85")').text().split(":", 1)[1]
    }, index=[0])
    
    pdfData.rename(columns={'DAYS WITH FOG': 'DAY WITH FOGS'}, inplace=True)
    pdfData.rename(columns={'eptrSectorName': 'eprtrSectorName'}, inplace=True)
    return pdfData


def getPdfNames():
    pdfsPath = './pdfs/'
    files = [f for f in os.listdir(pdfsPath) if os.path.isfile(os.path.join(pdfsPath, f))]
    for file in files:
        if file.split(".")[1] != 'pdf':
            files.remove(file)
    files = map(lambda file: pdfsPath + file, files)
    return files

pdfData = pd.DataFrame()
files = getPdfNames()
for file in files:
    pdfData = pd.concat([pdfData, scrapePdf(file)], axis=0)

Get JSONs

In [3]:
def jsonToDataframe(url):
    print(url)
    resp = requests.get(url=url)
    data = resp.json()
    df = pd.DataFrame()
    for count, row in enumerate(data):
        entry = pd.DataFrame(data[count], index=[count])
        df = pd.concat([df, entry], axis='rows')
    return df


urls = ['http://schneiderapihack-env.eba-3ais9akk.us-east-2.elasticbeanstalk.com/first',
        'http://schneiderapihack-env.eba-3ais9akk.us-east-2.elasticbeanstalk.com/second',
        'http://schneiderapihack-env.eba-3ais9akk.us-east-2.elasticbeanstalk.com/third']
jsonData = pd.DataFrame()
for url in urls:
    temp = jsonToDataframe(url)
    jsonData = pd.concat([jsonData, temp], axis='rows')
#jsonData.drop(columns=['EPRTRAnnexIMainActivityLabel'], inplace=True)
jsonData.drop(columns=['EPRETRSectorCode'], inplace=True)
jsonData.drop(columns=[''], inplace=True)

http://schneiderapihack-env.eba-3ais9akk.us-east-2.elasticbeanstalk.com/first
http://schneiderapihack-env.eba-3ais9akk.us-east-2.elasticbeanstalk.com/second
http://schneiderapihack-env.eba-3ais9akk.us-east-2.elasticbeanstalk.com/third


Get Csvs

In [5]:
def getcode(label, dictionary):
    # based on data from here: https://iir.umweltbundesamt.de/2021/general/point_sources/start
    if label == 'Chemical installations for the production on an industrial scale of basic organic chemicals: Organometallic compounds':
        return '4(a)(vii)'
    return dictionary[label]

df1 = pd.read_csv("./csvs/train1.csv")
df2 = pd.read_csv("./csvs/train2.csv", sep=';')
frames = [df1,df2]
csvData = pd.concat(frames)

names_list = csvData.columns.to_list()

aux = 0
for element in names_list:
    if csvData[element].isnull().values.any():
        aux = aux + 1
        print(csvData[element].isnull().values.any())
if aux > 0:
    print('existing nulls')
else:
    print('no existing nulls')
csvData['EPRTRAnnexIMainActivityCode'] = csvData.apply(lambda row: getcode(row['EPRTRAnnexIMainActivityLabel'], dict(zip(jsonData['EPRTRAnnexIMainActivityLabel'], jsonData['EPRTRAnnexIMainActivityCode']))), axis=1)
csvData.drop(columns=['EPRTRAnnexIMainActivityLabel'], inplace=True)
jsonData.drop(columns=['EPRTRAnnexIMainActivityLabel'], inplace=True)
#jsonData.drop(columns=['Unnamed: 0'], inplace=True)

no existing nulls


Unify data

In [26]:
data = pd.read_csv('dataset.csv')#pd.concat([jsonData, csvData, pdfData], axis='rows')

"""
data.drop(columns=['FacilityInspireID'], inplace=True)
data.drop(columns=['facilityName'], inplace=True)
data.drop(columns=['targetRelease'], inplace=True)
data.drop(columns=['CONTINENT'], inplace=True)
data.drop(columns=['REPORTER NAME'], inplace=True)
data.drop(columns=['CITY ID'], inplace=True)
data.drop(columns=['EPRETRSectorCode'], inplace=True)"""
data.drop(columns=['Unnamed: 0'], inplace=True)

We swap min and max values for wind and temperature when min values are greater than max values

In [27]:
data.loc[data['min_wind_speed'] > data['max_wind_speed'], ['min_wind_speed','max_wind_speed']] = data.loc[data['min_wind_speed'] > data['max_wind_speed'], ['max_wind_speed','min_wind_speed']].values
data.loc[data['min_temp'] > data['max_temp'], ['min_temp','max_temp']] = data.loc[data['min_temp'] > data['max_temp'], ['max_temp','min_temp']].values


Transformations:

Replace max, min and average temperature and winds for pdf values because they don't have sense

In [28]:
head = data.head(-82)
uk = head.loc[head['countryName'] == 'United Kingdom']
uk_max_wind = uk['max_wind_speed'].astype(float).mean()
uk_min_wind = uk['min_wind_speed'].astype(float).mean()
uk_avg_wind = uk['avg_wind_speed'].astype(float).mean()
uk_max_temp = uk['max_temp'].astype(float).mean()
uk_min_temp = uk['min_temp'].astype(float).mean()
uk_avg_temp = uk['avg_temp'].astype(float).mean()
tail = data.tail(82)
tail = tail.assign(max_wind_speed=uk_max_wind)
tail = tail.assign(min_wind_speed=uk_min_wind)
tail = tail.assign(avg_wind_speed=uk_avg_wind)
tail = tail.assign(max_temp=uk_max_temp)
tail = tail.assign(min_temp=uk_min_temp)
tail = tail.assign(avg_temp=uk_avg_temp)

data = pd.concat([head, tail], axis='rows')

In [29]:
data.to_csv('dataset.csv')
print("hola caracola")

hola caracola


In [30]:
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from numpy import mean
from numpy import std

Encoding:

In [35]:
le_general = LabelEncoder()
le_pollutant = LabelEncoder()
data.drop(columns=data.columns[0], inplace=True)
data['pollutant'] = le_pollutant.fit_transform(data['pollutant'])
data['countryName'] = le_general.fit_transform(data['countryName'])
data['eprtrSectorName'] = le_general.fit_transform(data['eprtrSectorName'])
data['City'] = le_general.fit_transform(data['City'])
data['EPRTRAnnexIMainActivityCode'] = le_general.fit_transform(data['EPRTRAnnexIMainActivityCode'])
features = data.columns.tolist()
features.remove('pollutant')
X = data[features]
y = data['pollutant']
print(features)

['City', 'DAY', 'DAY WITH FOGS', 'EPRTRAnnexIMainActivityCode', 'MONTH', 'avg_temp', 'avg_wind_speed', 'countryName', 'eprtrSectorName', 'max_temp', 'max_wind_speed', 'min_temp', 'min_wind_speed', 'reportingYear']


Creation of test and training dataset

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

KNN Model training

In [37]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=5)

# Train the model using the training sets
model.fit(X_train, y_train)

#Predict Output
y_pred = model.predict(X_test)
conf_mat = metrics.confusion_matrix(y_test, y_pred)
print(conf_mat)
print(metrics.accuracy_score(y_test, y_pred))
print(metrics.precision_score(y_test, y_pred, average='micro'))
print(metrics.recall_score(y_test, y_pred, average='micro'))
print(metrics.f1_score(y_test, y_pred, average='micro'))

[[2451  419 1748]
 [ 477 2548  325]
 [2007  518 2649]]
0.5819509968041394
0.5819509968041394
0.5819509968041394
0.5819509968041394


Hablamos de las metricas

In [39]:
from sklearn.linear_model import LogisticRegression


lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
conf_mat = metrics.confusion_matrix(y_test, y_pred)
print(conf_mat)
print(metrics.accuracy_score(y_test, y_pred))
print(metrics.precision_score(y_test, y_pred, average='micro'))
print(metrics.recall_score(y_test, y_pred, average='micro'))
print(metrics.f1_score(y_test, y_pred, average='micro'))

[[ 127  915 3576]
 [  22 2660  668]
 [ 165  982 4027]]
0.5184903363262822
0.5184903363262822
0.5184903363262822
0.5184903363262822


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [40]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
conf_mat = metrics.confusion_matrix(y_test, y_pred)
print(conf_mat)
print(metrics.accuracy_score(y_test, y_pred))
print(metrics.precision_score(y_test, y_pred, average='micro'))
print(metrics.recall_score(y_test, y_pred, average='micro'))
print(metrics.f1_score(y_test, y_pred, average='micro'))


[[ 645  977 2996]
 [  61 2521  768]
 [ 664 1043 3467]]
0.5047176989803683
0.5047176989803683
0.5047176989803683
0.5047176989803683


In [None]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
conf_mat = metrics.confusion_matrix(y_test, y_pred)
print(conf_mat)
print(metrics.accuracy_score(y_test, y_pred))
print(metrics.precision_score(y_test, y_pred, average='micro'))
print(metrics.recall_score(y_test, y_pred, average='micro'))
print(metrics.f1_score(y_test, y_pred, average='micro'))

feature_imp = pd.Series(clf.feature_importances_, index=features).sort_values(ascending=False)
print(feature_imp.to_string())

FP_0 = conf_mat[1][0] + conf_mat[2][0]
FN_0 = conf_mat[0][1] + conf_mat[0][2]
FP_1 = conf_mat[0][1] + conf_mat[2][1]
FN_1 = conf_mat[1][0] + conf_mat[1][2]
FP_2 = conf_mat[0][2] + conf_mat[1][2]
FN_2 = conf_mat[2][0] + conf_mat[2][1]

FP = FP_0 + FP_1 + FP_2
FN = FN_0 + FN_1 + FN_2
print(FP)
print(FN)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(clf, data.drop('pollutant', axis=1), data['pollutant'], scoring='f1_micro', cv=cv, n_jobs=-1, error_score='raise')
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))


[[2596   61 1961]
 [ 180 2913  257]
 [1572  119 3483]]
0.6842185359914777
0.6842185359914777
0.6842185359914777
0.6842185359914777
EPRTRAnnexIMainActivityCode    0.217310
eprtrSectorName                0.111532
City                           0.068943
min_wind_speed                 0.068760
max_wind_speed                 0.068740
avg_wind_speed                 0.067658
max_temp                       0.065634
min_temp                       0.065375
avg_temp                       0.064354
DAY                            0.051975
countryName                    0.045066
reportingYear                  0.041033
MONTH                          0.037565
DAY WITH FOGS                  0.026054
4150
4150
