# Panduan ETL Data GDELT

Selamat datang di halaman panduan ETL data GDELT.

## 1. Download

Berikut adalah contoh skrip untuk mengunduh data GDELT dan menyaring baris dengan kode negara ID (Indonesia). Selanjutnya, data dari berkas CSV dapat dipindahkan ke MySQL. Disadur dari [GitHub Pak Guntur Budi](https://github.com/gunturbudi/digital-talent/blob/master/RETRIEVE%20GDELT%20DATA.ipynb).

In [None]:
import requests
import lxml.html as lh

gdelt_base_url = 'http://data.gdeltproject.org/events/'

# Get the list of all the links on the GDELT file index page.
# A timeout keeps the notebook from hanging forever on a stalled connection,
# and raise_for_status() stops early on an HTTP error instead of parsing an
# error page into an empty (or garbage) file list.
page = requests.get(gdelt_base_url + 'index.html', timeout=60)
page.raise_for_status()
doc = lh.fromstring(page.content)
link_list = doc.xpath("//*/ul/li/a/@href")

# Keep only the links whose first four characters are digits
# (presumably the year/date-named archives -- verify against the index page).
file_list = [x for x in link_list if x[0:4].isdigit()]
print(file_list)

# infilecounter: index into file_list where processing starts (and resumes).
# outfilecounter: running number used to name the filtered output files.
infilecounter = outfilecounter = 0

import os.path
import urllib
import zipfile
import glob
import operator

# `import urllib` alone does not guarantee that the `request` submodule is
# loaded, so import it explicitly before calling urlretrieve().
import urllib.request

# Working directories. os.getcwd() has no trailing separator, so every path
# is built with os.path.join() instead of string concatenation; otherwise the
# files end up next to the working directory (e.g. "<cwd>tmp/") rather than
# inside it.
local_path = os.getcwd()
tmp_dir = os.path.join(local_path, 'tmp')
out_dir = os.path.join(local_path, 'gdelt_id')
os.makedirs(tmp_dir, exist_ok=True)
os.makedirs(out_dir, exist_ok=True)

fips_country_code = 'ID'

# Tab-separated field positions that are checked for the country code.
# NOTE(review): presumably the ActionGeo / Actor1Geo / Actor2Geo country-code
# columns -- confirm against the GDELT event codebook.
country_fields = operator.itemgetter(51, 37, 44)

for compressed_file in file_list[infilecounter:]:
    print(compressed_file)
    archive_path = os.path.join(local_path, compressed_file)

    # If the compressed file is not stored locally yet, go get it.
    # A failed download raises and stops the run.
    while not os.path.isfile(archive_path):
        print('downloading,', end=' ')
        urllib.request.urlretrieve(url=gdelt_base_url + compressed_file,
                                   filename=archive_path)

    # Extract the contents of the compressed file to the temporary directory.
    # The context manager closes the archive handle when extraction is done.
    print('extracting,', end=' ')
    with zipfile.ZipFile(file=archive_path, mode='r') as z:
        z.extractall(path=tmp_dir)

    # Parse each of the extracted files in the temporary directory.
    print('parsing,', end=' ')
    for infile_name in glob.glob(os.path.join(tmp_dir, '*')):
        outfile_name = os.path.join(
            out_dir, fips_country_code + '%04i.tsv' % outfilecounter)

        # Write the output as UTF-8 too, so that a line which was decoded
        # successfully can always be written back, whatever the platform's
        # default encoding is.
        with open(infile_name, mode='r', encoding='utf-8') as infile, \
                open(outfile_name, mode='w', encoding='utf-8') as outfile:
            for line in infile:
                try:
                    codes = country_fields(line.split('\t'))
                except IndexError:
                    # Malformed line with too few fields: skip it.
                    continue
                # Keep only the lines related to our country of interest.
                if fips_country_code in codes:
                    outfile.write(line)
        outfilecounter += 1

        # Delete the temporary file.
        os.remove(infile_name)
    infilecounter += 1
    print('done')

## 2. Simpan ke MySQL
Agar data bisa diakses lokal dengan mudah, kita bisa menyimpannya dalam database MySQL.

### A. Bangun koneksi ke MySQL lokal
Silakan buka phpMyAdmin atau program SQL favorit Anda (misalnya HeidiSQL atau Laragon), lalu buat database dengan nama 'gdelt_content_id'.

In [None]:
import pymysql
import pandas as pd
from sqlalchemy import create_engine

# Connection settings for the local MySQL server.
host = 'localhost'
port = '3306'
username = 'root'
password = ''
database = 'gdelt_content_id'

# Assemble the SQLAlchemy URL (pymysql driver) from the settings above and
# create the engine used by the rest of the notebook.
connection_url = f'mysql+pymysql://{username}:{password}@{host}:{port}/{database}'
engine = create_engine(connection_url)
# The bare string below has no effect; it is kept as an example of a
# hard-coded connection URL.
'''engine = create_engine('mysql+pymysql://root: @localhost:3306/gdelt_content_id')'''

def run(sql):
    """Run ``sql`` through the module-level ``engine`` and return a DataFrame.

    Args:
        sql: Query passed unchanged to ``pandas.read_sql_query``.

    Returns:
        The ``pandas.DataFrame`` produced by ``read_sql_query``.
    """
    return pd.read_sql_query(sql, engine)

### B. List Data dalam Folder
Di bagian ini, kita membaca daftar berkas data GDELT harian (TSV) yang ada di dalam folder.

In [None]:
from os import listdir
from os.path import isfile, join
# Folder that holds the filtered TSV files.
folder_id = './gdelt_id/'

# Collect the names of the regular files in the folder (sub-directories are
# skipped), in the order os.listdir() returns them.
files = []
for entry in listdir(folder_id):
    if isfile(join(folder_id, entry)):
        files.append(entry)
print(files)

In [None]:
# Open the first listed file as a sanity check.
# header=None keeps the first row as data: the TSV files written by the
# download step contain only raw event rows, so without it the first event
# would be consumed as column names.
df_awal = pd.read_csv(folder_id + files[0], sep="\t", header=None)
df_awal.head()

### C. Langsung simpan ke MySQL
Untuk menyimpan data ke SQL, kita update data tabelnya untuk setiap file tsv yang ada

In [None]:
# Read header / column names: list() over the DataFrame yields the column
# labels of the given sheet.
colnames = list(pd.read_excel('CSV.header.fieldids.xlsx', sheet_name='CSV.header.dailyupdates'))

for berkas in files:
    print('Extracting ' + berkas)

    # Important: If your ID0000.tsv contains header, you can delete the header
    # (the column names are supplied explicitly through `names`).
    # NOTE(review): read_csv infers dtypes per file, so zero-padded codes may
    # be loaded as integers -- confirm whether that matters and pass dtype=
    # if it does.
    df_satuan = pd.read_csv(folder_id + berkas, sep="\t", names=colnames)

    # Append the rows to the SQL table; the table name is the value of
    # `database`, and the table is created on the first insert if missing.
    df_satuan.to_sql(name=database, con=engine, if_exists='append', index=False)