# 1. Importing Required Modules

In [1]:
from zipfile import ZipFile 
import pandas as pd
import os

# 2. Extract data files from zip archives

## 2.a. Create a function for data extraction

In [18]:
def security_data_consolidator(path_folder):
    """Merge the text files stored in a folder's zip archives into one file.

    Every zip archive directly inside ``path_folder`` is read in file-name
    order, and the text of each archive member is written to
    ``<path_folder>/<folder name>.txt``. The first line (the header) is kept
    only for the first non-empty member; it is dropped from every later one.

    The combined file is rebuilt from scratch on each call, so running the
    cell again does not duplicate rows. Entries in the folder that are not
    zip archives (including a combined file left by an earlier run) are
    skipped.

    Parameters
    ----------
    path_folder : str
        Folder containing the zip archives. The combined file is written
        into this folder and named after it.

    Returns
    -------
    None

    Raises
    ------
    FileNotFoundError
        If ``path_folder`` does not exist.
    UnicodeDecodeError
        If an archive member is not valid UTF-8 text.
    """
    # Local import: the notebook's import cell only brings in ``ZipFile``.
    from zipfile import is_zipfile

    # Name the output after the folder itself (not after the whole path), so
    # nested, absolute, and trailing-slash paths resolve correctly.
    folder_name = os.path.basename(os.path.abspath(path_folder))
    combined_path = os.path.join(path_folder, folder_name + '.txt')

    # Collect the archives up front, in sorted order, so the row order of the
    # combined file is reproducible across runs and platforms.
    archive_paths = []
    for entry in sorted(os.listdir(path_folder)):
        entry_path = os.path.join(path_folder, entry)
        if os.path.isfile(entry_path) and is_zipfile(entry_path):
            archive_paths.append(entry_path)

    header_written = False
    # Tracks whether the last chunk written ended with a line break, so that
    # the last row of one file is never glued to the first row of the next.
    ends_with_newline = True

    # 'w' (not 'a'): always start from an empty combined file.
    with open(combined_path, 'w', encoding="utf-8") as combined_file:
        for archive_path in archive_paths:
            with ZipFile(archive_path, 'r') as mf_zip:
                # Read members straight from the archive; no temporary
                # extraction folder is needed, and archives holding more
                # than one file are handled.
                for member in mf_zip.infolist():
                    if member.is_dir():
                        continue

                    text = mf_zip.read(member).decode("utf-8")
                    # Same newline normalisation a text-mode read would do.
                    text = text.replace('\r\n', '\n').replace('\r', '\n')

                    # Only the first file contributes the header line.
                    if header_written:
                        _, _, text = text.partition('\n')
                    if not text:
                        continue

                    if not ends_with_newline:
                        combined_file.write('\n')
                    combined_file.write(text)
                    header_written = True
                    ends_with_newline = text.endswith('\n')

## 2.b. Extract data from securities master file 

In [19]:
# Builds Dataset_Masterfile_Efek/Dataset_Masterfile_Efek.txt from the zip archives in that folder.
security_data_consolidator('Dataset_Masterfile_Efek')

## 2.c. Extract data from securities ownership master file

In [20]:
# Builds Dataset_Ownership_Efek/Dataset_Ownership_Efek.txt from the zip archives in that folder.
security_data_consolidator('Dataset_Ownership_Efek')

# 3. Read the combined data

In [23]:
# The combined master file is pipe-delimited, hence sep='|'.
securities_mf = pd.read_csv('Dataset_Masterfile_Efek/Dataset_Masterfile_Efek.txt', sep='|')
# Summarise column dtypes and non-null counts to spot sparsely populated fields.
securities_mf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88688 entries, 0 to 88687
Data columns (total 28 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Date             88688 non-null  object 
 1   Code             88688 non-null  object 
 2   Description      88688 non-null  object 
 3   Type             88688 non-null  object 
 4   Isin Code        88688 non-null  object 
 5   Issuer           88688 non-null  object 
 6   Status           88688 non-null  object 
 7   Stock Exchange   73690 non-null  object 
 8   Listing Date     70570 non-null  object 
 9   Currency         88688 non-null  object 
 10  Form             88688 non-null  object 
 11  Eff. Date Isin   7280 non-null   object 
 12  Maturity Date    54746 non-null  object 
 13  Expire Date      3398 non-null   object 
 14  Exercise Price   3998 non-null   float64
 15  Interest         45825 non-null  float64
 16  Interest Type    50585 non-null  object 
 17  Interest Fre

In [24]:
# The combined ownership file is pipe-delimited, hence sep='|'.
securities_os = pd.read_csv('Dataset_Ownership_Efek/Dataset_Ownership_Efek.txt', sep='|')
# Preview the first rows.
securities_os.head()

Unnamed: 0,Date,Code,Type,Sec. Num,Price,Local IS,Local CP,Local PF,Local IB,Local ID,...,Foreign IS,Foreign CP,Foreign PF,Foreign IB,Foreign ID,Foreign MF,Foreign SC,Foreign FD,Foreign OT,Total.1
0,31-MAY-2021,AALI,EQUITY,1924688000.0,8825,114938014.0,11274440.0,19468622,49700,100544313,...,1403910,3855240,7316838,13323164,1423464,32725069,26900419,208822,6914300,94071226
1,31-MAY-2021,ABBA,EQUITY,2755125000.0,246,0.0,1692657000.0,0,0,566614547,...,0,2120900,0,14969700,1800,0,1900,0,0,17094300
2,31-MAY-2021,ABDA,EQUITY,620806700.0,7050,21885.0,47538680.0,0,98,47919272,...,0,426136555,0,97403500,400,0,0,0,62,523540517
3,31-MAY-2021,ABMM,EQUITY,2753165000.0,825,2677000.0,6965900.0,1600,0,50690300,...,0,57405300,0,320433900,406600,10351700,0,0,151862900,540460400
4,31-MAY-2021,ACES,EQUITY,17150000000.0,1495,328564522.0,99277100.0,20041945,0,272974411,...,21176900,715615629,1246731977,511051505,1561100,2235817184,196662025,4720799,941505231,5874842350


In [26]:
# Preview the rows whose Type is anything other than 'EQUITY'.
securities_os[securities_os['Type']!='EQUITY'].head()

Unnamed: 0,Date,Code,Type,Sec. Num,Price,Local IS,Local CP,Local PF,Local IB,Local ID,...,Foreign IS,Foreign CP,Foreign PF,Foreign IB,Foreign ID,Foreign MF,Foreign SC,Foreign FD,Foreign OT,Total.1
766,31-MAY-2021,ABSM01A,CORPORATE BOND,115000000000.0,1,750000000.0,500000000.0,0,0,15650000000,...,0,0,0,0,0,0,0,0,0,0
767,31-MAY-2021,ABSM01B,CORPORATE BOND,25000000000.0,1,1000000000.0,0.0,0,0,4000000000,...,0,0,0,0,0,0,0,0,0,0
768,31-MAY-2021,ABSM01C,CORPORATE BOND,35000000000.0,1,0.0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
769,31-MAY-2021,ADCP01A,CORPORATE BOND,491000000000.0,1,0.0,100000000000.0,10000000000,0,0,...,0,0,0,0,0,0,0,0,0,0
770,31-MAY-2021,ADCP01B,CORPORATE BOND,9000000000.0,1,5000000000.0,0.0,2000000000,0,0,...,0,0,0,0,0,0,0,0,0,0
