## อ่านข้อมูล Original

In [13]:
import pandas as pd
import numpy as np
import random

In [14]:
# Local fallback copy of the workbook, used when the remote read fails.
path = "data_Changed_Data_Types.xlsx"
# Remote copy of the same workbook hosted on GitHub.
url = "https://raw.githubusercontent.com/panjaphon-kmitl/edav-cleaning/main/data_Changed_Data_Types.xlsx"

In [15]:
# Try the remote workbook first; if that read fails for any ordinary reason,
# fall back to the local file.
try:
    original = pd.read_excel(url)
except Exception:
    # `Exception` instead of a bare `except:` so that KeyboardInterrupt and
    # SystemExit still propagate rather than silently triggering the fallback.
    original = pd.read_excel(path)

In [16]:
# Bare expression: shows the loaded DataFrame as the notebook cell's output.
original

Unnamed: 0,country,location_name,latitude,longitude,timezone,last_updated_epoch,last_updated,temperature_celsius,temperature_fahrenheit,condition_text,...,air_quality_PM2.5,air_quality_PM10,air_quality_us-epa-index,air_quality_gb-defra-index,sunrise,sunset,moonrise,moonset,moon_phase,moon_illumination
0,Afghanistan,Kabul,34.52,69.18,Asia/Kabul,1715849100,45428.552083,26.6,79.8,Partly Cloudy,...,8.4,26.6,1,1,0.201389,0.784722,0.508333,0.049306,Waxing Gibbous,55
1,Albania,Tirana,41.33,19.82,Europe/Tirane,1715849100,45428.447917,19.0,66.2,Partly cloudy,...,1.1,2.0,1,1,0.222917,0.829167,0.540278,0.093056,Waxing Gibbous,55
2,Algeria,Algiers,36.76,3.05,Africa/Algiers,1715849100,45428.406250,23.0,73.4,Sunny,...,10.4,18.4,1,1,0.236111,0.826389,0.552083,0.093056,Waxing Gibbous,55
3,Andorra,Andorra La Vella,42.50,1.52,Europe/Andorra,1715849100,45428.447917,6.3,43.3,Light drizzle,...,0.7,0.9,1,1,0.271528,0.882639,0.591667,0.146528,Waxing Gibbous,55
4,Angola,Luanda,-8.84,13.23,Africa/Luanda,1715849100,45428.406250,26.0,78.8,Partly cloudy,...,183.4,262.3,5,10,0.258333,0.746528,0.553472,0.026389,Waxing Gibbous,55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18588,Venezuela,Caracas,10.50,-66.92,America/Caracas,1724156100,45524.343750,27.2,81.0,Sunny,...,8.0,9.9,1,1,0.262500,0.779861,0.822222,0.293056,Waning Gibbous,100
18589,Vietnam,Hanoi,21.03,105.85,Asia/Bangkok,1724156100,45524.802083,32.1,89.8,Moderate or heavy rain with thunder,...,108.5,120.9,4,10,0.234028,0.765278,0.791667,0.243750,Waning Gibbous,100
18590,Yemen,Sanaa,15.35,44.21,Asia/Aden,1724156100,45524.635417,24.3,75.8,Patchy rain nearby,...,23.1,82.3,2,2,0.242361,0.765972,0.797917,0.260417,Waning Gibbous,100
18591,Zambia,Lusaka,-15.42,28.28,Africa/Lusaka,1724156100,45524.593750,26.2,79.1,Sunny,...,15.9,23.9,2,2,0.263889,0.750000,0.786806,0.285417,Waning Gibbous,100


## วางยา 1 : ทำให้ NULL

In [17]:
def create_nulls(df, column_name, p):
    """Overwrite a random fraction of one column with NaN, in place.

    Args:
        df: DataFrame to modify. It is mutated directly, not copied.
        column_name: Label of the column whose values are blanked out.
        p: Fraction of rows to blank; the row count is ``int(len(df) * p)``.

    Returns:
        The same ``df`` object that was passed in.
    """
    how_many = int(len(df) * p)
    # Draw distinct index labels so exactly `how_many` rows are affected.
    chosen_labels = np.random.choice(df.index, size=how_many, replace=False)
    df.loc[chosen_labels, column_name] = np.nan
    return df

In [18]:
# Work on a copy so `original` stays untouched, then blank out a fixed
# fraction of each listed column. The order matters for reproducibility
# because every call draws from NumPy's global random state.
poisoned = original.copy()
for column_name, fraction in (
    ('latitude', 0.12),
    ('longitude', 0.05),
    ('moon_illumination', 0.14),
    ('country', 0.04),
    ('temperature_celsius', 0.01),
    ('temperature_fahrenheit', 0.02),
):
    poisoned = create_nulls(poisoned, column_name, fraction)

## วางยา 2 : สะกดคำในคอลัมน์ condition_text ผิด

In [19]:
# Lookup table: correctly spelled condition text -> candidate misspellings.
misspelled_words = {
    "Partly Cloudy": ["Partly Cluody", "Pratly Cloudy"],
    "Partly cloudy": ["Partly clody", "Pratly cloudy"],
    "Sunny": ["Sunnny"],
    "Overcast": ["Overcastt", "Overcst"],
    "Patchy rain nearby": ["Patchy rain nerby", "Patchy rain"],
    "Light rain": ["Light Rain"],
    "Mist": ["Mistt"],
    "Cloudy": ["Cloudyy"],
    "Thundery outbreaks in nearby": [
        "Thundery outbraks in nearby",
        "Thundery outbreaks",
        "Thunder Outbreaks",
        "Thunder Outbreak",
    ],
    "Light rain shower": ["Light ran shower", "Ligt rain shower"],
}


def random_misspell(word, p):
    """Return a misspelled variant of ``word`` with probability ``p``.

    Args:
        word: Value to possibly corrupt.
        p: Threshold compared against ``random.random()``; the word is only
            considered for corruption when the draw is ``<= p``.

    Returns:
        A randomly chosen entry from ``misspelled_words[word]`` when the draw
        succeeds and ``word`` is a key of the table; otherwise ``word``
        unchanged.
    """
    # The draw always happens first, even for words missing from the table.
    if random.random() > p:
        return word
    if word not in misspelled_words:
        return word
    variants = misspelled_words[word]
    return random.choice(variants)


In [20]:
# Copy the null-injected frame, then pass every condition text through
# random_misspell with a 0.1 threshold.
poisoned_2 = poisoned.copy()
poisoned_2['condition_text'] = list(
    map(lambda text: random_misspell(text, 0.1), poisoned_2['condition_text'])
)

## วางยา 3 : เปลี่ยนชื่อบางประเทศเป็นพิมพ์ผิดและตัวย่อ

In [21]:
# Lookup table: correctly spelled country name -> wrong spellings and
# abbreviations that may replace it.
# NOTE(review): keys must match the dataset's `country` values exactly. The
# previous keys "Untied Kingdom" and "Phillippines" were themselves typos and
# so could never match; confirm the exact spellings against the data.
wrong_country_names = {
    "United States of America": ["USA"],
    "United Kingdom": ["UK"],
    "Philippines": ["Phillippines", "PH"],
    "Brazil": ["Brasil"],
}


def misspell_country(country, p):
    """Return a wrong spelling or abbreviation of ``country`` with probability ``p``.

    Args:
        country: Value to possibly corrupt.
        p: Threshold compared against ``random.random()``; the name is only
            considered for corruption when the draw is ``<= p``.

    Returns:
        A randomly chosen entry from ``wrong_country_names[country]`` when the
        draw succeeds and ``country`` is a key of the table; otherwise
        ``country`` unchanged.
    """
    # The draw always happens first, even for countries missing from the table.
    if random.random() > p:
        return country
    if country not in wrong_country_names:
        return country
    return random.choice(wrong_country_names[country])


In [22]:
# Copy the previous stage, then pass every country value through
# misspell_country with a 0.4 threshold.
poisoned_3 = poisoned_2.copy()
poisoned_3['country'] = list(
    map(lambda name: misspell_country(name, 0.4), poisoned_3['country'])
)

## วางยา 4 : ทำให้ Row ซ้ำ

In [23]:
def duplicate_rows(df, num_duplicates=5):
    """Return a new DataFrame with randomly sampled rows of ``df`` appended.

    Args:
        df: Source DataFrame; it is not modified.
        num_duplicates: Number of rows to sample and append. Sampling is done
            with replacement, so the same row may be picked more than once.

    Returns:
        A DataFrame holding the rows of ``df`` followed by the sampled rows,
        with the index renumbered from 0.
    """
    extra_rows = df.sample(n=num_duplicates, replace=True)
    # ignore_index renumbers the result so the copies get fresh labels.
    return pd.concat([df, extra_rows], ignore_index=True)


In [24]:
# Final stage: append 7 randomly sampled duplicate rows to a copy of the
# previous stage.
poisoned_4 = duplicate_rows(poisoned_3.copy(), 7)

## ข้อมูลหลังวางยา

In [25]:
# Bare expression: shows the fully corrupted DataFrame as the cell's output.
poisoned_4

Unnamed: 0,country,location_name,latitude,longitude,timezone,last_updated_epoch,last_updated,temperature_celsius,temperature_fahrenheit,condition_text,...,air_quality_PM2.5,air_quality_PM10,air_quality_us-epa-index,air_quality_gb-defra-index,sunrise,sunset,moonrise,moonset,moon_phase,moon_illumination
0,Afghanistan,Kabul,34.52,69.18,Asia/Kabul,1715849100,45428.552083,26.6,79.8,Partly Cloudy,...,8.4,26.6,1,1,0.201389,0.784722,0.508333,0.049306,Waxing Gibbous,55.0
1,Albania,Tirana,,19.82,Europe/Tirane,1715849100,45428.447917,19.0,66.2,Partly cloudy,...,1.1,2.0,1,1,0.222917,0.829167,0.540278,0.093056,Waxing Gibbous,
2,Algeria,Algiers,36.76,3.05,Africa/Algiers,1715849100,45428.406250,23.0,73.4,Sunny,...,10.4,18.4,1,1,0.236111,0.826389,0.552083,0.093056,Waxing Gibbous,55.0
3,Andorra,Andorra La Vella,42.50,1.52,Europe/Andorra,1715849100,45428.447917,6.3,43.3,Light drizzle,...,0.7,0.9,1,1,0.271528,0.882639,0.591667,0.146528,Waxing Gibbous,55.0
4,Angola,Luanda,-8.84,13.23,Africa/Luanda,1715849100,45428.406250,26.0,78.8,Partly cloudy,...,183.4,262.3,5,10,0.258333,0.746528,0.553472,0.026389,Waxing Gibbous,55.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18595,Australia,Canberra,-35.28,149.22,Australia/Sydney,1716387300,45435.010417,1.0,33.8,Clear,...,2.7,5.9,1,1,0.288889,0.710417,0.664583,0.224306,Waxing Gibbous,97.0
18596,Guatemala,Guatemala City,14.62,,America/Guatemala,1717596900,45448.343750,22.0,71.6,Mistt,...,137.8,179.7,4,10,0.230556,0.770139,0.188194,0.747917,Waning Crescent,
18597,Liberia,Monrovia,6.31,-10.80,Africa/Monrovia,1721392200,45492.520833,25.5,77.9,Light rain shower,...,1.3,5.3,1,1,0.275000,0.793750,0.729167,0.186806,Waxing Gibbous,92.0
18598,Slovakia,Bratislava,48.15,17.12,Europe/Bratislava,1719063900,45465.656250,26.1,79.0,Partly cloudy,...,2.8,3.0,1,1,0.202083,0.872222,0.920139,0.185417,Full Moon,100.0
