In [12]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('Combined_DS_v10.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   SurveyDate                  999 non-null    object 
 1   FormalEducation             999 non-null    object 
 2   ConvertedSalary             665 non-null    float64
 3   Hobby                       999 non-null    object 
 4   Country                     999 non-null    object 
 5   StackOverflowJobsRecommend  487 non-null    float64
 6   VersionControl              999 non-null    object 
 7   Age                         999 non-null    int64  
 8   Years Experience            999 non-null    int64  
 9   Gender                      693 non-null    object 
 10  RawSalary                   665 non-null    object 
dtypes: float64(2), int64(2), object(7)
memory usage: 86.0+ KB


In [3]:
#選取特定資料類型的資料
numeric_df = df.select_dtypes(include = ['int64','float'])
print(numeric_df.columns)

Index(['ConvertedSalary', 'StackOverflowJobsRecommend', 'Age',
       'Years Experience'],
      dtype='object')


In [5]:
#簡單處理類別型資料
dummy = pd.get_dummies(df, columns=['Country'], drop_first=True, prefix='DM')
print(dummy.columns)

Index(['SurveyDate', 'FormalEducation', 'ConvertedSalary', 'Hobby',
       'StackOverflowJobsRecommend', 'VersionControl', 'Age',
       'Years Experience', 'Gender', 'RawSalary', 'DM_India', 'DM_Ireland',
       'DM_Russia', 'DM_South Africa', 'DM_Spain', 'DM_Sweeden', 'DM_UK',
       'DM_USA', 'DM_Ukraine'],
      dtype='object')


In [7]:
#透過門檻篩選資料
countries = df['Country']

country_counts = countries.value_counts()

mask = countries.isin(country_counts[country_counts < 10].index)

print(mask.head())

countries[mask] = 'Other'

print(countries.value_counts())

0    False
1    False
2    False
3    False
4    False
Name: Country, dtype: bool
South Africa    166
USA             164
Spain           134
Sweeden         119
France          115
Russia           97
UK               95
India            95
Other            14
Name: Country, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  countries[mask] = 'Other'


In [13]:
#自行建立資料區段來切分資料
df['Paid_Job'] = 0

df.loc[df.ConvertedSalary > 0, 'Paid_Job'] = 1

print(df[['Paid_Job', 'ConvertedSalary']].head())

df['equal_binned'] = pd.cut(df['ConvertedSalary'], bins = 5)

print(df[['equal_binned', 'ConvertedSalary']].head())


# Specify the boundaries of the bins
bins = [-np.inf, 10000, 50000, 100000, 150000, np.inf]

# Bin labels
labels = ['Very low', 'Low', 'Medium', 'High', 'Very high']

# Bin the continuous variable ConvertedSalary using these boundaries
df['boundary_binned'] = pd.cut(df['ConvertedSalary'], 
                                         bins = bins, labels = labels)

# Print the first 5 rows of the boundary_binned column
print(df[['boundary_binned', 'ConvertedSalary']].head())

   Paid_Job  ConvertedSalary
0         0              NaN
1         1          70841.0
2         0              NaN
3         1          21426.0
4         1          41671.0
          equal_binned  ConvertedSalary
0                  NaN              NaN
1  (-2000.0, 400000.0]          70841.0
2                  NaN              NaN
3  (-2000.0, 400000.0]          21426.0
4  (-2000.0, 400000.0]          41671.0
  boundary_binned  ConvertedSalary
0             NaN              NaN
1          Medium          70841.0
2             NaN              NaN
3             Low          21426.0
4             Low          41671.0
