In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
titanic_df = sns.load_dataset('titanic')
taxis_df = sns.load_dataset('taxis')

In [3]:
# 1
titanic_df.dtypes

survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

In [4]:
# 2
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [5]:
titanic_df['survived'].dtype

dtype('int64')

In [6]:
taxis_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6433 entries, 0 to 6432
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   pickup           6433 non-null   datetime64[ns]
 1   dropoff          6433 non-null   datetime64[ns]
 2   passengers       6433 non-null   int64         
 3   distance         6433 non-null   float64       
 4   fare             6433 non-null   float64       
 5   tip              6433 non-null   float64       
 6   tolls            6433 non-null   float64       
 7   total            6433 non-null   float64       
 8   color            6433 non-null   object        
 9   payment          6389 non-null   object        
 10  pickup_zone      6407 non-null   object        
 11  dropoff_zone     6388 non-null   object        
 12  pickup_borough   6407 non-null   object        
 13  dropoff_borough  6388 non-null   object        
dtypes: datetime64[ns](2), float64(5), int64(

In [7]:
# Verifying numeric columns - to check basic statistics
titanic_df['age'].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: age, dtype: float64

In [8]:
sample_df = pd.DataFrame({
    'Name': ['Alice Smith', 'Bob Johnson', 'Charlie Lee', 'David Brown', 'Eva White', 'Frank Black', 'Grace Green'],
    'Age': [28, 34, 'Unknown', 42, 'Unknown', 36, 30],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix', 'Philadelphia', 'San Antonio'],
    'Salary': [70000, 80000, 65000, 120000, 50000, 90000, 75000]
    },
    index=np.arange(1, 8))
sample_df

Unnamed: 0,Name,Age,City,Salary
1,Alice Smith,28,New York,70000
2,Bob Johnson,34,Los Angeles,80000
3,Charlie Lee,Unknown,Chicago,65000
4,David Brown,42,Houston,120000
5,Eva White,Unknown,Phoenix,50000
6,Frank Black,36,Philadelphia,90000
7,Grace Green,30,San Antonio,75000


In [9]:
sample_df['Age'].where(sample_df['Age'] == 'Unknown', np.nan)

1        NaN
2        NaN
3    Unknown
4        NaN
5    Unknown
6        NaN
7        NaN
Name: Age, dtype: object

In [10]:
sample_df['Age'].where(sample_df['Age'] != 'Unknown', np.nan)

1     28
2     34
3    NaN
4     42
5    NaN
6     36
7     30
Name: Age, dtype: object

In [11]:
titanic_df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [12]:
mixed_dtypes = {}
for column in titanic_df.columns:
    types = titanic_df[column].apply(lambda x: type(x).__name__).value_counts()
    if len(types)> 1:
        mixed_dtypes.update(types)

In [13]:
mixed_dtypes

{'str': np.int64(889), 'float': np.int64(2)}

In [14]:
def mixed_dtypes_detector_1(df):
    mixed_dtypes = {}
    for column in df.columns:
        types = df[column].apply(lambda x: type(x).__name__).value_counts()
        if len(types)> 1:
            mixed_dtypes.update(types)
    return mixed_dtypes

def mixed_dtypes_detector_2(df):
    mixed_types = {}
    for column in df.columns:
        types = df[column].apply(lambda v: type(v).__name__).value_counts()
        if len(types) > 1:
            mixed_types[column] = types.to_dict()
    return mixed_types

In [15]:
mixed_dtypes_detector_1(titanic_df)

{'str': np.int64(889), 'float': np.int64(2)}

In [16]:
mixed_dtypes_detector_2(titanic_df)

{'embarked': {'str': 889, 'float': 2}, 'embark_town': {'str': 889, 'float': 2}}

In [17]:
tmp_dict = {
    'a': 1, 'b': 2, 'c': 3
}

In [18]:
tmp_dict['a']

1

In [19]:
tmp_dict['d'] = 4
tmp_dict

{'a': 1, 'b': 2, 'c': 3, 'd': 4}

In [20]:
titanic_df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [21]:
mixed_dtypes = {}
for column in titanic_df.columns:
    types = titanic_df[column].apply(lambda x: type(x).__name__).value_counts()
    if len(types) > 1:
        mixed_dtypes[column] = types.to_dict()
mixed_dtypes

{'embarked': {'str': 889, 'float': 2}, 'embark_town': {'str': 889, 'float': 2}}

In [22]:
def mixed_dtypes_detector(df):
    mixed_dtypes = {}
    for column in df.columns:
        types = df[column].apply(lambda x: type(x).__name__).value_counts()
        if len(types) > 1:
            mixed_dtypes[column] = types.to_dict()
    return mixed_dtypes

mixed_dtypes_detector(taxis_df)

{'payment': {'str': 6389, 'float': 44},
 'pickup_zone': {'str': 6407, 'float': 26},
 'dropoff_zone': {'str': 6388, 'float': 45},
 'pickup_borough': {'str': 6407, 'float': 26},
 'dropoff_borough': {'str': 6388, 'float': 45}}

In [23]:
mixed_dtypes_detector(titanic_df)

{'embarked': {'str': 889, 'float': 2}, 'embark_town': {'str': 889, 'float': 2}}