In [1]:
import polars as pl

### Load the dataset

In [2]:
"""
Polars expects UTF-8 text by default. The dataset file contains invalid UTF-8 characters.
We use:
    "utf8-lossy":
        - Tries to decode as UTF-8
        - Replaces invalid characters instead of crashing
    
    "ignore_errors":
        This skips bad rows instead of failing the entire load. It is useful when:
            - A row has too many/few columns
            - A value can’t be parsed into the inferred type
            - There are malformed quotes or delimiters

        Tradeoff: some data may be dropped silently.
    
    "low_memory":
        What it does:
            Reduces memory usage during CSV parsing
            Polars processes the file in smaller chunks
    
        Why it matters:
            The dataset is large with hundreds of thousands of rows and many text columns
    
        This flag:
            - Uses less RAM
            - Slightly slower
            - Safer on laptops / limited-memory machines
"""

# Read the raw CSV into `df`.
#   - row_index_*: prepend a row counter column named "ID" that starts at 1.
#   - encoding / ignore_errors: tolerate invalid UTF-8 bytes and unparseable
#     input instead of aborting the load.
#   - low_memory: lower memory pressure at the cost of some parsing speed.
df = pl.read_csv(
    'data/global-terrorism.csv',
    row_index_name="ID",
    row_index_offset=1,
    encoding="utf8-lossy",
    low_memory=True,
    ignore_errors=True,
)

### First rows in the dataset

In [3]:
# Preview the first five rows (five is also the default when n is omitted).
df.head(n=5)

ID,eventid,iyear,imonth,iday,approxdate,extended,resolution,country,country_txt,region,region_txt,provstate,city,latitude,longitude,specificity,vicinity,location,summary,crit1,crit2,crit3,doubtterr,alternative,alternative_txt,multiple,success,suicide,attacktype1,attacktype1_txt,attacktype2,attacktype2_txt,attacktype3,attacktype3_txt,targtype1,targtype1_txt,…,nkill,nkillus,nkillter,nwound,nwoundus,nwoundte,property,propextent,propextent_txt,propvalue,propcomment,ishostkid,nhostkid,nhostkidus,nhours,ndays,divert,kidhijcountry,ransom,ransomamt,ransomamtus,ransompaid,ransompaidus,ransomnote,hostkidoutcome,hostkidoutcome_txt,nreleased,addnotes,scite1,scite2,scite3,dbsource,INT_LOG,INT_IDEO,INT_MISC,INT_ANY,related
u32,i64,i64,i64,i64,str,i64,str,i64,str,i64,str,str,str,f64,f64,i64,i64,str,str,i64,i64,i64,i64,i64,str,i64,i64,i64,i64,str,i64,str,str,str,i64,str,…,i64,i64,i64,i64,i64,i64,i64,i64,str,i64,str,i64,i64,i64,str,str,str,str,i64,i64,str,i64,str,str,i64,str,i64,str,str,str,str,str,i64,i64,i64,i64,str
1,197000000001,1970,7,2,,0,,58,"""Dominican Republic""",2,"""Central America & Caribbean""",,"""Santo Domingo""",18.456792,-69.951164,1,0,,,1,1,1,0,,,0,1,0,1,"""Assassination""",,,,,14,"""Private Citizens & Property""",…,1.0,,,0.0,,,0,,,,,0,,,,,,,0,,,,,,,,,,,,,"""PGIS""",0,0,0,0,
2,197000000002,1970,0,0,,0,,130,"""Mexico""",1,"""North America""","""Federal""","""Mexico city""",19.371887,-99.086624,1,0,,,1,1,1,0,,,0,1,0,6,"""Hostage Taking (Kidnapping)""",,,,,7,"""Government (Diplomatic)""",…,0.0,,,0.0,,,0,,,,,1,1.0,0.0,,,,"""Mexico""",1,800000.0,,,,,,,,,,,,"""PGIS""",0,1,1,1,
3,197001000001,1970,1,0,,0,,160,"""Philippines""",5,"""Southeast Asia""","""Tarlac""","""Unknown""",15.478598,120.599741,4,0,,,1,1,1,0,,,0,1,0,1,"""Assassination""",,,,,10,"""Journalists & Media""",…,1.0,,,0.0,,,0,,,,,0,,,,,,,0,,,,,,,,,,,,,"""PGIS""",-9,-9,1,1,
4,197001000002,1970,1,0,,0,,78,"""Greece""",8,"""Western Europe""","""Attica""","""Athens""",37.99749,23.762728,1,0,,,1,1,1,0,,,0,1,0,3,"""Bombing/Explosion""",,,,,7,"""Government (Diplomatic)""",…,,,,,,,1,,,,,0,,,,,,,0,,,,,,,,,,,,,"""PGIS""",-9,-9,1,1,
5,197001000003,1970,1,0,,0,,101,"""Japan""",4,"""East Asia""","""Fukouka""","""Fukouka""",33.580412,130.396361,1,0,,,1,1,1,-9,,,0,1,0,7,"""Facility/Infrastructure Attack""",,,,,7,"""Government (Diplomatic)""",…,,,,,,,1,,,,,0,,,,,,,0,,,,,,,,,,,,,"""PGIS""",-9,-9,1,1,


### Last rows in the dataset

In [4]:
# Preview the last five rows (five is also the default when n is omitted).
df.tail(n=5)

ID,eventid,iyear,imonth,iday,approxdate,extended,resolution,country,country_txt,region,region_txt,provstate,city,latitude,longitude,specificity,vicinity,location,summary,crit1,crit2,crit3,doubtterr,alternative,alternative_txt,multiple,success,suicide,attacktype1,attacktype1_txt,attacktype2,attacktype2_txt,attacktype3,attacktype3_txt,targtype1,targtype1_txt,…,nkill,nkillus,nkillter,nwound,nwoundus,nwoundte,property,propextent,propextent_txt,propvalue,propcomment,ishostkid,nhostkid,nhostkidus,nhours,ndays,divert,kidhijcountry,ransom,ransomamt,ransomamtus,ransompaid,ransompaidus,ransomnote,hostkidoutcome,hostkidoutcome_txt,nreleased,addnotes,scite1,scite2,scite3,dbsource,INT_LOG,INT_IDEO,INT_MISC,INT_ANY,related
u32,i64,i64,i64,i64,str,i64,str,i64,str,i64,str,str,str,f64,f64,i64,i64,str,str,i64,i64,i64,i64,i64,str,i64,i64,i64,i64,str,i64,str,str,str,i64,str,…,i64,i64,i64,i64,i64,i64,i64,i64,str,i64,str,i64,i64,i64,str,str,str,str,i64,i64,str,i64,str,str,i64,str,i64,str,str,str,str,str,i64,i64,i64,i64,str
181687,201712310022,2017,12,31,,0,,182,"""Somalia""",11,"""Sub-Saharan Africa""","""Middle Shebelle""","""Ceelka Geelow""",2.359673,45.385034,2,0,"""The incident occurred near the…","""12/31/2017: Assailants opened …",1,1,0,1,1.0,"""Insurgency/Guerilla Action""",0,1,0,2,"""Armed Assault""",,,,,4,"""Military""",…,1,0,0,2,0,0,-9,,,,,0,,,,,,,,,,,,,,,,,"""""Somalia: Al-Shabaab Militants…","""""Highlights: Somalia Daily Med…","""""Highlights: Somalia Daily Med…","""START Primary Collection""",0,0,0,0,
181688,201712310029,2017,12,31,,0,,200,"""Syria""",10,"""Middle East & North Africa""","""Lattakia""","""Jableh""",35.407278,35.942679,1,1,"""The incident occurred at the H…","""12/31/2017: Assailants launche…",1,1,0,1,1.0,"""Insurgency/Guerilla Action""",0,1,0,3,"""Bombing/Explosion""",,,,,4,"""Military""",…,2,0,0,7,0,0,1,4.0,"""Unknown""",-99.0,"""Seven military planes were dam…",0,,,,,,,,,,,,,,,,,"""""Putin's 'victory' in Syria ha…","""""Two Russian soldiers killed a…","""""Two Russian servicemen killed…","""START Primary Collection""",-9,-9,1,1,
181689,201712310030,2017,12,31,,0,,160,"""Philippines""",5,"""Southeast Asia""","""Maguindanao""","""Kubentog""",6.900742,124.437908,2,0,"""The incident occurred in the D…","""12/31/2017: Assailants set fir…",1,1,1,0,,,0,1,0,7,"""Facility/Infrastructure Attack""",,,,,14,"""Private Citizens & Property""",…,0,0,0,0,0,0,1,4.0,"""Unknown""",-99.0,"""Houses were damaged in this at…",0,,,,,,,,,,,,,,,,,"""""Maguindanao clashes trap trib…",,,"""START Primary Collection""",0,0,0,0,
181690,201712310031,2017,12,31,,0,,92,"""India""",6,"""South Asia""","""Manipur""","""Imphal""",24.798346,93.94043,1,0,"""The incident occurred in the M…","""12/31/2017: Assailants threw a…",1,1,1,0,,,0,0,0,3,"""Bombing/Explosion""",,,,,2,"""Government (General)""",…,0,0,0,0,0,0,-9,,,,,0,,,,,,,,,,,,,,,,,"""""Trader escapes grenade attack…",,,"""START Primary Collection""",-9,-9,0,-9,
181691,201712310032,2017,12,31,,0,,160,"""Philippines""",5,"""Southeast Asia""","""Maguindanao""","""Cotabato City""",7.209594,124.241966,1,0,,"""12/31/2017: An explosive devic…",1,1,1,0,,,0,0,0,3,"""Bombing/Explosion""",,,,,20,"""Unknown""",…,0,0,0,0,0,0,0,,,,,0,,,,,,,,,,,,,,,,,"""""Security tightened in Cotabat…","""""Security tightened in Cotabat…",,"""START Primary Collection""",-9,-9,0,-9,


### Count total rows and columns in the dataset

In [5]:
# Dimensions of the loaded frame as a (row count, column count) tuple.
(df.height, df.width)

(181691, 136)

In [6]:
# Number of null values in each column, returned as a one-row frame.
# No further aggregation is applied: summing a frame that has a single row
# would return the same per-column values.
df.null_count()

ID,eventid,iyear,imonth,iday,approxdate,extended,resolution,country,country_txt,region,region_txt,provstate,city,latitude,longitude,specificity,vicinity,location,summary,crit1,crit2,crit3,doubtterr,alternative,alternative_txt,multiple,success,suicide,attacktype1,attacktype1_txt,attacktype2,attacktype2_txt,attacktype3,attacktype3_txt,targtype1,targtype1_txt,…,nkill,nkillus,nkillter,nwound,nwoundus,nwoundte,property,propextent,propextent_txt,propvalue,propcomment,ishostkid,nhostkid,nhostkidus,nhours,ndays,divert,kidhijcountry,ransom,ransomamt,ransomamtus,ransompaid,ransompaidus,ransomnote,hostkidoutcome,hostkidoutcome_txt,nreleased,addnotes,scite1,scite2,scite3,dbsource,INT_LOG,INT_IDEO,INT_MISC,INT_ANY,related
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,…,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,172452,0,179471,0,0,0,0,421,434,4556,4557,6,0,126196,66129,0,0,0,1,152680,152680,1,0,0,0,0,175377,175377,181263,181263,0,0,…,10313,64446,66958,16313,64702,69143,0,117626,117626,142828,123732,178,168119,168174,177628,173567,181367,178386,104310,180441,181128,180950,181139,181177,170700,170700,171291,153402,66191,104758,138175,0,0,0,0,0,156653


# Data cleaning

### Data filtering

In [23]:
# Keep only rows whose `suicide` value equals 3 and store the resulting
# (rows, columns) shape in `query`.
# NOTE(review): presumably a sanity check for unexpected codes in `suicide` — confirm.
query = df.filter(pl.col("suicide").eq(3)).shape
query

(0, 136)

### Drop null values


In [None]:
# Discard every row that is missing a latitude or a longitude.
# Note: this rebinds `df`, so cells below see only the rows with coordinates.
df = df.drop_nulls(subset=["latitude", "longitude"])

### Confirm that null values have been dropped successfully

In [None]:
# Per-column null counts for the current `df`, as a one-row frame.
df.select(pl.all().null_count())

In [None]:
# Project `df` down to the date and location columns, then remove any row
# that lacks a longitude, latitude or province/state value.
selected_data = (
    df.select(
        "iyear",
        "imonth",
        "country_txt",
        "region_txt",
        "provstate",
        "latitude",
        "longitude",
    )
    .drop_nulls(subset=["longitude", "latitude", "provstate"])
)

# Per-column null counts of the reduced frame.
selected_data.null_count()