In [1]:
import pandas as pd
import requests
import os

### Download files from the GitHub repository for analysis

In [2]:
#Base URL of repository used for download
url_dwl = 'https://github.com/FranciscoFoz/7_Days_of_Code_Alura-Python-Pandas/raw/main/Dia_1-Importando_dados/Datasets/'
folder_name = 'dados_emprestimos'


In [3]:
#Creating directory to save CSV files
if not os.path.exists('Dados/Emprestimos/'):
    os.makedirs('Dados/Emprestimos/')
    print('Directory created successfully')
else:
    print('existing directory')



existing directory


In [4]:
#loop to download each file found in the repository
for i in range(20101, 20202):
    file_name = f'emprestimos-{i}.csv'
    file_path = f'Dados/Emprestimos/{file_name}'
    url = f'{url_dwl}/{folder_name}/{file_name}'
    
    if os.path.exists(file_path):
        print(f'{file_name} already downloaded.')
    else:
        response = requests.get(url)
        
        if response.status_code == 200:
            with open(file_path, 'wb') as f:
                f.write(response.content)
            print(f'{file_name} successfully downloaded!')
        else:
            print(f'{file_name} not found in repository')

emprestimos-20101.csv already downloaded.
emprestimos-20102.csv already downloaded.
emprestimos-20103.csv not found in repository
emprestimos-20104.csv not found in repository
emprestimos-20105.csv not found in repository
emprestimos-20106.csv not found in repository
emprestimos-20107.csv not found in repository
emprestimos-20108.csv not found in repository
emprestimos-20109.csv not found in repository
emprestimos-20110.csv not found in repository
emprestimos-20111.csv already downloaded.
emprestimos-20112.csv already downloaded.
emprestimos-20113.csv not found in repository
emprestimos-20114.csv not found in repository
emprestimos-20115.csv not found in repository
emprestimos-20116.csv not found in repository
emprestimos-20117.csv not found in repository
emprestimos-20118.csv not found in repository
emprestimos-20119.csv not found in repository
emprestimos-20120.csv not found in repository
emprestimos-20121.csv already downloaded.
emprestimos-20122.csv already downloaded.
emprestimos-

### Reading CSV Files and Concatenating to a Single DataFrame

In [5]:
#Creating a looping to read all csv files from the directory and concatenate in just one DataFrame
dfs = []
for i in range(2010, 2021):
    df1 = pd.read_csv(f'Dados/Emprestimos/emprestimos-{i}1.csv')
    dfs.append(df1)
    try:
        df2 = pd.read_csv(f'Dados/Emprestimos/emprestimos-{i}2.csv')
        dfs.append(df2)
    except FileNotFoundError:
        pass
emprestimos = pd.concat(dfs, ignore_index=True)
emprestimos



Unnamed: 0,id_emprestimo,codigo_barras,data_renovacao,data_emprestimo,data_devolucao,matricula_ou_siape,tipo_vinculo_usuario
0,709684,L095049,,2010/01/04 07:44:10.721000000,2010/01/05 16:26:12.662000000,2.008023e+09,ALUNO DE GRADUAÇÃO
1,709685,L167050,,2010/01/04 07:44:10.750000000,2010/01/12 07:34:13.934000000,2.008023e+09,ALUNO DE GRADUAÇÃO
2,709686,2006017618,2010/01/26 08:07:01.738000000,2010/01/04 08:08:44.081000000,2010/02/25 07:36:25.800000000,2.008112e+09,ALUNO DE PÓS-GRADUAÇÃO
3,709687,L184117,2010/01/18 11:07:46.470000000,2010/01/04 08:24:21.284000000,2010/02/03 08:58:45.692000000,2.007211e+08,ALUNO DE GRADUAÇÃO
4,709684,L095049,,2010/01/04 07:44:10.721000000,2010/01/05 16:26:12.662000000,2.008023e+09,ALUNO DE GRADUAÇÃO
...,...,...,...,...,...,...,...
2258013,2989086,2008047044,,2020/05/05 19:51:06.059000000,2021/05/13 14:53:31.000000000,3.067431e+06,SERVIDOR TÉCNICO-ADMINISTRATIVO
2258014,2989087,2008047047,,2020/05/05 19:51:06.135000000,2021/05/13 14:53:38.000000000,3.067431e+06,SERVIDOR TÉCNICO-ADMINISTRATIVO
2258015,2989088,2010052905,,2020/05/05 19:51:06.164000000,2021/05/13 15:01:46.000000000,3.067431e+06,SERVIDOR TÉCNICO-ADMINISTRATIVO
2258016,2989089,2010048023,,2020/06/09 11:54:02.669000000,2021/07/21 15:33:55.000000000,2.020101e+10,ALUNO DE PÓS-GRADUAÇÃO


### Check and Remove Null Values and Duplicates in DataFrame

In [6]:
#check null values
print(f'Total null values ​​in DataFrame:\n{emprestimos.isnull().sum()}')


Total null values ​​in DataFrame:
id_emprestimo                 0
codigo_barras                 0
data_renovacao          1285720
data_emprestimo               0
data_devolucao             6471
matricula_ou_siape         3170
tipo_vinculo_usuario          0
dtype: int64


In [7]:
#check duplicates
print(f'Total duplicates in DataFrame: {emprestimos.duplicated().sum()}')

Total duplicates in DataFrame: 37


In [8]:
#Removing Null Values and Duplicates in DataFrame
emprestimos.drop_duplicates(inplace=True)
emprestimos.dropna(inplace=True)

In [9]:
#Checking if there are still duplicates or null values ​​in the DataFrame
duplicates = emprestimos.duplicated().sum()
null_values = emprestimos.isnull().sum().sum()
print(f'Total duplicates: {duplicates}')
print(f'Total null values: {null_values}')

Total duplicates: 0
Total null values: 0


### Importing copies of the collection

In [10]:
#Importing copy
exemplares = pd.read_parquet('Dados/Exemplar/dados_exemplares.parquet')
exemplares

Unnamed: 0_level_0,id_exemplar,codigo_barras,colecao,biblioteca,status_material,localizacao,registro_sistema
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,5,L000003,Acervo Circulante,Biblioteca Central Zila Mamede,REGULAR,694,1
1,4,L000002,Acervo Circulante,Biblioteca Central Zila Mamede,REGULAR,688,1
2,3,L000001,Acervo Circulante,Biblioteca Central Zila Mamede,ESPECIAL,638,1
3,7,L000114,Acervo Circulante,Biblioteca Central Zila Mamede,REGULAR,616,5
5,10,L000041,Acervo Circulante,Biblioteca Central Zila Mamede,ESPECIAL,657,15
...,...,...,...,...,...,...,...
568580,1353472,2021011150,Acervo Circulante,Biblioteca Setorial do Núcleo de Educação da I...,REGULAR,951,268231
568581,1353473,2019013454,Acervo Circulante,Biblioteca Central Zila Mamede,ESPECIAL,997,268233
568582,1353474,2019012811,Acervo Circulante,Biblioteca Central Zila Mamede,ESPECIAL,987,268234
568583,1353475,2019013451,Acervo Circulante,Biblioteca Central Zila Mamede,ESPECIAL,764,268235


### Join Exemplar to the treated Frame of the loan variable

In [11]:
loans_new = emprestimos.merge(exemplares)
loans_new

Unnamed: 0,id_emprestimo,codigo_barras,data_renovacao,data_emprestimo,data_devolucao,matricula_ou_siape,tipo_vinculo_usuario,id_exemplar,colecao,biblioteca,status_material,localizacao,registro_sistema
0,709686,2006017618,2010/01/26 08:07:01.738000000,2010/01/04 08:08:44.081000000,2010/02/25 07:36:25.800000000,2.008112e+09,ALUNO DE PÓS-GRADUAÇÃO,195347,Acervo Circulante,Biblioteca Setorial Prof. Rodolfo Helinski - E...,REGULAR,640,75019
1,749227,2006017618,2010/04/19 17:35:25.987000000,2010/04/05 11:25:57.627000000,2010/04/27 11:37:02.530000000,2.011105e+09,ALUNO DE PÓS-GRADUAÇÃO,195347,Acervo Circulante,Biblioteca Setorial Prof. Rodolfo Helinski - E...,REGULAR,640,75019
2,1175444,2006017618,2012/03/11 17:05:30.982000000,2012/02/24 16:18:40.959000000,2012/03/28 14:32:13.000000000,2.010057e+09,ALUNO DE GRADUAÇÃO,195347,Acervo Circulante,Biblioteca Setorial Prof. Rodolfo Helinski - E...,REGULAR,640,75019
3,1209694,2006017618,2012/04/11 12:34:59.458000000,2012/03/28 14:33:04.249000000,2012/04/25 11:48:28.000000000,2.010057e+09,ALUNO DE GRADUAÇÃO,195347,Acervo Circulante,Biblioteca Setorial Prof. Rodolfo Helinski - E...,REGULAR,640,75019
4,1374962,2006017618,2012/12/12 07:40:40.406000000,2012/11/14 12:41:25.081000000,2013/01/15 17:10:17.000000000,2.011104e+09,ALUNO DE PÓS-GRADUAÇÃO,195347,Acervo Circulante,Biblioteca Setorial Prof. Rodolfo Helinski - E...,REGULAR,640,75019
...,...,...,...,...,...,...,...,...,...,...,...,...,...
906407,2988812,2016019436,2020/04/26 11:43:40.062000000,2020/03/17 16:20:01.992000000,2021/06/09 12:33:39.000000000,2.016103e+10,ALUNO DE PÓS-GRADUAÇÃO,1235460,Acervo Circulante,Biblioteca Central Zila Mamede,REGULAR,511,233523
906408,2988874,2010080699,2021/08/10 02:30:14.871000000,2020/03/17 17:07:40.834000000,2022/02/17 11:48:53.000000000,2.017015e+10,ALUNO DE GRADUAÇÃO,826353,Acervo Circulante,Biblioteca Central Zila Mamede,REGULAR,660,134891
906409,2988922,2018031109,2020/03/29 10:17:16.672000000,2020/03/17 17:45:40.794000000,2021/03/25 10:22:11.000000000,2.018013e+10,ALUNO DE GRADUAÇÃO,1342486,Acervo Circulante,Biblioteca Central Zila Mamede,REGULAR,717,264237
906410,2988968,2016033194,2020/04/06 18:21:57.168000000,2020/03/17 18:33:29.752000000,2021/06/15 16:52:35.000000000,2.016014e+10,ALUNO DE GRADUAÇÃO,1310816,Acervo Circulante,Biblioteca Setorial da Faculdade de Ciências d...,REGULAR,785,253484
