# dropna

In [1]:
import pandas as pd

students_data = {
    'Name': ['Marta', 'Elisa', None],
    'Age': [28, None, 27],
    'Field': ['Math', 'Physics', 'Biology']
}

students_df = pd.DataFrame(students_data)

cleaned_df = students_df.dropna()
print(cleaned_df)

    Name   Age Field
0  Marta  28.0  Math


# fillna

In [2]:
students_df = pd.DataFrame({
    'Name': ['Ann', 'Dan', 'Ellen', 'John', 'Kat'],
    'Age': [21, None, None, 21, 23],
    'Field': ['Math', 'Physics', 'Biology', 'Math', 'Physics'],
    'Group': ['M19', 'P18', 'B20', 'M19', 'P17']
    })

students_df['Age'].fillna(students_df['Age'].median(), inplace=True)
print(students_df)

    Name   Age    Field Group
0    Ann  21.0     Math   M19
1    Dan  21.0  Physics   P18
2  Ellen  21.0  Biology   B20
3   John  21.0     Math   M19
4    Kat  23.0  Physics   P17


In [3]:
students1_df = pd.DataFrame({
    'Name':  ['Ann', 'Dan', 'Ellen', 'John', 'Kat'],
    'Age':   [21, None, None, 21, 23],
    'Field': ['Math', 'Physics', 'Biology', 'Math', 'Physics'],
    'Group': ['M19', 'P18', 'B20', 'M19', 'P17'],
    'Passed exams': [None, 2, 3, None, None]
    })

students1_df['Passed exams'].fillna(0, inplace=True)
print(students_df)

    Name   Age    Field Group
0    Ann  21.0     Math   M19
1    Dan  21.0  Physics   P18
2  Ellen  21.0  Biology   B20
3   John  21.0     Math   M19
4    Kat  23.0  Physics   P17


# drop

In [7]:
tabel_studenti = {
    'Name': ['Ann', 'Dan', 'Ellen'],
    'Age': [21, 22, 20],
    'Field': ['Math', 'Physics', 'Biology']
}
tabel_studenti_df = pd.DataFrame(tabel_studenti)

tabel_studenti_df.drop([1], inplace=True)
print(tabel_studenti_df)

    Name  Age    Field
0    Ann   21     Math
2  Ellen   20  Biology


# drop_duplicates

In [8]:
stud= pd.DataFrame({
    'Name': ['Ann', 'Dan', 'Ellen', 'John', 'Kat', 'John'],
    'Field': ['Math', 'Physics', 'Biology', 'History', 'Physics', 'Math'],
    'Average score': ['90-100', '90-100', '85-89', '75-84', '85-89', '75-84']
})

print(stud)

    Name    Field Average score
0    Ann     Math        90-100
1    Dan  Physics        90-100
2  Ellen  Biology         85-89
3   John  History         75-84
4    Kat  Physics         85-89
5   John     Math         75-84


In [9]:
stud.drop_duplicates(subset='Name', keep='last', inplace=True)
print(stud)

    Name    Field Average score
0    Ann     Math        90-100
1    Dan  Physics        90-100
2  Ellen  Biology         85-89
4    Kat  Physics         85-89
5   John     Math         75-84


# replace

In [10]:
stud1 = pd.DataFrame({
    'Name': ['Ann', 'Dan', 'Ellen', 'John', 'Kat'],
    'Field': ['Math', 'Fisics', 'Biology', 'Math', 'Fisics'],
    'Average score': ['A', 'A', 'B', 'C', 'B'] 
})
    
stud1.replace({'A': '90-100',
                     'B': '85-89', 
                     'C': '75-84', 
                     'D': '65-74',
                     'E': '60-64'}, inplace=True)
print(stud1)

    Name    Field Average score
0    Ann     Math        90-100
1    Dan   Fisics        90-100
2  Ellen  Biology         85-89
3   John     Math         75-84
4    Kat   Fisics         85-89


In [11]:
stud1.replace(to_replace='Fisics',
                    value='Physics', 
                    inplace=True)
print(stud1)

    Name    Field Average score
0    Ann     Math        90-100
1    Dan  Physics        90-100
2  Ellen  Biology         85-89
3   John     Math         75-84
4    Kat  Physics         85-89


# astype

In [12]:
s_data = {
  'Name': ['Ann', 'Dan', 'Ellen'],
  'Age': [21.0, 22.0, 20.0],
  'Field': ['Math', 'Physics', 'Biology']
}

students3_df = pd.DataFrame(s_data)

students3_df['Age'] = students3_df['Age'].astype(int)
print(students3_df.dtypes)


Name     object
Age       int32
Field    object
dtype: object


# Standardizarea datelor

In [13]:
ts_data = {
    'Name': ['Ann', 'dan', 'vicky'],
    'Age': [21, 22, 20],
    'Field': ['Math', 'PHYSICS', 'biology']
}

students4_df = pd.DataFrame(ts_data)

In [14]:
students4_df['Field'] = students4_df['Field'].str.lower()
students4_df['Name'] = students4_df['Name'].str.capitalize()

print(students4_df)

    Name  Age    Field
0    Ann   21     math
1    Dan   22  physics
2  Vicky   20  biology


# date_range

In [15]:
dates = pd.date_range(start='2021-09-01', freq='D', periods=8)

temperatures = pd.Series([23, 17, 17, 16, 15, 14, 17, 20], index=dates)

print(temperatures)

2021-09-01    23
2021-09-02    17
2021-09-03    17
2021-09-04    16
2021-09-05    15
2021-09-06    14
2021-09-07    17
2021-09-08    20
Freq: D, dtype: int64


# dt accesor

In [16]:
timeseries = pd.Series(['2021-08-01', '2021-08-01'])
timeseries = pd.to_datetime(timeseries)
print(timeseries.dt.year)

0    2021
1    2021
dtype: int32


# timedelta

In [17]:
students5_df = pd.DataFrame({
    'Name': ['Ann', 'Dan', 'Ellen', 'John', 'Kat'],
    'Field': ['Math', 'Physics', 'Biology', 'Math', 'Physics'],
    'Course start': ['2019-09-01', '2018-03-01', '2020-03-01', '2019-03-01', '2017-09-01']
})

print(students5_df)

    Name    Field Course start
0    Ann     Math   2019-09-01
1    Dan  Physics   2018-03-01
2  Ellen  Biology   2020-03-01
3   John     Math   2019-03-01
4    Kat  Physics   2017-09-01


In [18]:
from datetime import timedelta

In [None]:
students5_df['Course start'] = pd.to_datetime(students5_df['Course start'])
students5_df['Course end'] = students5_df['Course start'] + timedelta(days=round(3.75*365))

print(students5_df)

# concat

In [28]:
students1 = pd.DataFrame({'Name': ['Ann', 'Dan', 'Ellen', 'John', 'Kat'],
                          'Field': ['Math', 'Physics', 'Biology', 'Math', 'Physics'],
                          'Group': ['M19', 'P18', 'B20', 'M19', 'P17'],
                          'Average score': [92, 65, 74, 83, 91]},
                         index=['am19', 'bp18', 'ob20', 'im19', 'kp17']
)

print(students1, end='\n\n')

students2 = pd.DataFrame({'Name': ['Elon', 'Irene', 'Margie', 'Stephen',],
                          'Field': ['History', 'Math', 'Computer Science', 'Computer Science'],
                          'Group': ['H19', 'M18', 'CS20', 'CS19'],
                          'Average score': [98, 72, 81, 78]},
                         index=['eh19', 'im18', 'mcs20', 'scs19']
)

print(students2)

       Name    Field Group  Average score
am19    Ann     Math   M19             92
bp18    Dan  Physics   P18             65
ob20  Ellen  Biology   B20             74
im19   John     Math   M19             83
kp17    Kat  Physics   P17             91

          Name             Field Group  Average score
eh19      Elon           History   H19             98
im18     Irene              Math   M18             72
mcs20   Margie  Computer Science  CS20             81
scs19  Stephen  Computer Science  CS19             78


In [29]:
concatenated = pd.concat([students1, students2], join = 'inner')
print(concatenated)

          Name             Field Group  Average score
am19       Ann              Math   M19             92
bp18       Dan           Physics   P18             65
ob20     Ellen           Biology   B20             74
im19      John              Math   M19             83
kp17       Kat           Physics   P17             91
eh19      Elon           History   H19             98
im18     Irene              Math   M18             72
mcs20   Margie  Computer Science  CS20             81
scs19  Stephen  Computer Science  CS19             78


# merge

In [30]:
studentz = pd.DataFrame({'Name': ['Ann', 'Dan', 'Ellen', 'John', 'Kat'],
                          'Field': ['Math', 'Physics', 'Biology', 'Math', 'Physics'],
                          'Group': ['M19', 'P18', 'B20', 'M19', 'P17']},
                         index=['am19', 'bp18', 'ob20', 'im19', 'kp17']
)

print(studentz, end='\n\n')

gradez = pd.DataFrame({'Name': ['Ann', 'Ellen', 'John', 'Kat'],
                       'Physics': [99, None, None, 87],
                       'Math': [78, 68, 91, 89],
                       'Biology': [None, 87, None, None],
                       'Chemistry': [None, 73, None, None]})

print(gradez, end='\n\n')

merged = pd.merge(left=studentz,
                  right=gradez,
                  on='Name',
                  how='outer')

print(merged)

       Name    Field Group
am19    Ann     Math   M19
bp18    Dan  Physics   P18
ob20  Ellen  Biology   B20
im19   John     Math   M19
kp17    Kat  Physics   P17

    Name  Physics  Math  Biology  Chemistry
0    Ann     99.0    78      NaN        NaN
1  Ellen      NaN    68     87.0       73.0
2   John      NaN    91      NaN        NaN
3    Kat     87.0    89      NaN        NaN

    Name    Field Group  Physics  Math  Biology  Chemistry
0    Ann     Math   M19     99.0  78.0      NaN        NaN
1    Dan  Physics   P18      NaN   NaN      NaN        NaN
2  Ellen  Biology   B20      NaN  68.0     87.0       73.0
3   John     Math   M19      NaN  91.0      NaN        NaN
4    Kat  Physics   P17     87.0  89.0      NaN        NaN


# join

In [31]:
students6 = pd.DataFrame({'Name': ['Ann', 'Dan', 'Ellen', 'John', 'Kat'],
                          'Field': ['Math', 'Physics', 'Biology', 'Math', 'Physics'],
                          'Group': ['M19', 'P18', 'B20', 'M19', 'P17']},
                         index=['am19', 'bp18', 'ob20', 'im19', 'kp17']
)

print(students6, end='\n\n')

grades6 = pd.DataFrame({'Physics': [99, None, None, 87],
                       'Math': [78, 68, 91, 89],
                       'Biology': [None, 87, None, None],
                       'Chemistry': [None, 73, None, None]},
                     index=pd.Series(['Ann', 'Ellen', 'John', 'Kat'], name='Name'))

print(grades6, end='\n\n')

joined = students6.join(grades6,
                      on='Name')

print(joined)

       Name    Field Group
am19    Ann     Math   M19
bp18    Dan  Physics   P18
ob20  Ellen  Biology   B20
im19   John     Math   M19
kp17    Kat  Physics   P17

       Physics  Math  Biology  Chemistry
Name                                    
Ann       99.0    78      NaN        NaN
Ellen      NaN    68     87.0       73.0
John       NaN    91      NaN        NaN
Kat       87.0    89      NaN        NaN

       Name    Field Group  Physics  Math  Biology  Chemistry
am19    Ann     Math   M19     99.0  78.0      NaN        NaN
bp18    Dan  Physics   P18      NaN   NaN      NaN        NaN
ob20  Ellen  Biology   B20      NaN  68.0     87.0       73.0
im19   John     Math   M19      NaN  91.0      NaN        NaN
kp17    Kat  Physics   P17     87.0  89.0      NaN        NaN


# apply

In [32]:
dataw = {
    'Data': ['2023-08-01', '2023-08-02', '2023-08-03'],
    'Temperature C': [25, 28, 24]
}

weather_df = pd.DataFrame(dataw)

In [33]:
weather_df['Temperature F'] = weather_df['Temperature C'].apply(lambda temp: (temp * 9/5) + 32)
print(weather_df)

         Data  Temperature C  Temperature F
0  2023-08-01             25           77.0
1  2023-08-02             28           82.4
2  2023-08-03             24           75.2


In [34]:
datai = {
    'Product': ['iPhone 13', 'MacBook Pro', 'Apple Watch'],
    'Price': [699, 1299, 399],
    'Discount': [0.1, 0.05, 0.15]
}

dfi = pd.DataFrame(datai)

dfi['Final Price'] = dfi.apply(lambda row: row['Price'] * (1 - row['Discount']), axis=1)

print(dfi)

       Product  Price  Discount  Final Price
0    iPhone 13    699      0.10       629.10
1  MacBook Pro   1299      0.05      1234.05
2  Apple Watch    399      0.15       339.15


# map

In [36]:
phones = pd.DataFrame({'Price': [12000, 16000, 29000, 40000, 26000],
                       'Second hand price': [11000, 13000, 20000, 35000, 22000]},
                      index=pd.Series(['iPhone X', 'iPhone 12', 'iPhone 14', 'Samsung Galaxy S23',
                                       'Samsung Galaxy S22'], name='Model'))
phones_discounted = phones.map(lambda price: price * 0.9)

print(phones_discounted)

                      Price  Second hand price
Model                                         
iPhone X            10800.0             9900.0
iPhone 12           14400.0            11700.0
iPhone 14           26100.0            18000.0
Samsung Galaxy S23  36000.0            31500.0
Samsung Galaxy S22  23400.0            19800.0


# numpy.where

In [37]:
phones1 = pd.DataFrame({
    'Model': ['iPhone X', 'iPhone 12', 'iPhone 14', 'Samsung Galaxy S23', 'Samsung Galaxy S22'],
    'Year': [2017, 2020, 2022, 2023, 2022],
    'Price': [12000, 16000, 29000, 40000, 26000]
})

print(phones1)

                Model  Year  Price
0            iPhone X  2017  12000
1           iPhone 12  2020  16000
2           iPhone 14  2022  29000
3  Samsung Galaxy S23  2023  40000
4  Samsung Galaxy S22  2022  26000


In [38]:
from numpy import where

where(phones1['Year'] < 2022,
      phones1['Price']*0.8,
      phones1['Price'])

array([ 9600., 12800., 29000., 40000., 26000.])

In [39]:
phones1['Promo price'] = where(phones1['Year'] < 2022,
                               phones1['Price']*0.8,
                               phones1['Price'])

In [40]:
print(phones1)

                Model  Year  Price  Promo price
0            iPhone X  2017  12000       9600.0
1           iPhone 12  2020  16000      12800.0
2           iPhone 14  2022  29000      29000.0
3  Samsung Galaxy S23  2023  40000      40000.0
4  Samsung Galaxy S22  2022  26000      26000.0
