# Creating Pandas Data Structures

### Creating Pandas Series

In [17]:
import pandas as pd
series=pd.Series([10,20,30,40,50])
print(series)

0    10
1    20
2    30
3    40
4    50
dtype: int64


In [30]:
#trying series with different data type (strings)
series=pd.Series([10,20,30,40,50],["Sara","Mariam","Ziad","Mohamed","Jana"])
print(series)

Sara       10
Mariam     20
Ziad       30
Mohamed    40
Jana       50
dtype: int64


### Creating Data Frame From a Dictionary

In [201]:
data={'Name':['Mariam','Jana','Fatma'],
      'Age':[21,19,20],
      'Major':['Engineering','Pharmacy','Dentistry'],
      'Birth Month':[9,1,11],
      'Birth place':['Cairo','Alexandria','Assiut']
     }
df=pd.DataFrame(data)
print(df)

     Name  Age        Major  Birth Month Birth place
0  Mariam   21  Engineering            9       Cairo
1    Jana   19     Pharmacy            1  Alexandria
2   Fatma   20    Dentistry           11      Assiut


# Viewing and Inspecting Data

### Viewing the First Few Rows


In [203]:
data={'Name':['Mariam','Jana','Fatma'],
      'Age':[21,19,20],
      'Major':['Engineering','Pharmacy','Dentistry'],
      'Birth Month':[9,1,11],
      'Birth place':['Cairo','Alexandria','Assiut']
     }
df=pd.DataFrame(data)
df.head(2) # the number betwwen brackets indicate the number of rows that u want to display

Unnamed: 0,Name,Age,Major,Birth Month,Birth place
0,Mariam,21,Engineering,9,Cairo
1,Jana,19,Pharmacy,1,Alexandria


### Getting DataFrame Information

In [205]:
df.info() #Displaying the info about the whole data frame (null / non-null/ rows/ columns)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Name         3 non-null      object
 1   Age          3 non-null      int64 
 2   Major        3 non-null      object
 3   Birth Month  3 non-null      int64 
 4   Birth place  3 non-null      object
dtypes: int64(2), object(3)
memory usage: 252.0+ bytes


### Describing Statistical Information

In [207]:
df.describe() #Displaying descriptive statistics to make sure that it doesn't include outliers or any logical errors

Unnamed: 0,Age,Birth Month
count,3.0,3.0
mean,20.0,7.0
std,1.0,5.291503
min,19.0,1.0
25%,19.5,5.0
50%,20.0,9.0
75%,20.5,10.0
max,21.0,11.0


# Selecting and Filtering Data

### Selecting Columns

In [209]:
print(df['Age']) #you can select only one column as it returns series only

0    21
1    19
2    20
Name: Age, dtype: int64


### Filtering Rows Based on Condition

In [211]:
filtered_dataframe=df[df['Birth Month']>10]
print(filtered_dataframe)

    Name  Age      Major  Birth Month Birth place
2  Fatma   20  Dentistry           11      Assiut


In [213]:
filtered_dataframe=df[df['Major']!='Engineering']
print(filtered_dataframe)

    Name  Age      Major  Birth Month Birth place
1   Jana   19   Pharmacy            1  Alexandria
2  Fatma   20  Dentistry           11      Assiut


### Selecting Specific Rows and Columns


In [215]:
selected_data=df.loc[1:2,['Major','Birth place']]
print(selected_data)

       Major Birth place
1   Pharmacy  Alexandria
2  Dentistry      Assiut


In [217]:
selected_data=df.loc[1:2] #Displaying all the columns by default
print(selected_data)

    Name  Age      Major  Birth Month Birth place
1   Jana   19   Pharmacy            1  Alexandria
2  Fatma   20  Dentistry           11      Assiut


In [177]:
selected_data=df.iloc[1:3,[0,2,4]] #Using indicies to locate rows and columns keeping columns always in seprated square brackets
print(selected_data)

    Name      Major Birth place
1   Jana   Pharmacy  Alexandria
2  Fatma  Dentistry      Assiut


# Modifying Data

### Adding a New Column

In [219]:
df['Sports']=['Football','Basketball','Swimming']
print(df)

     Name  Age        Major  Birth Month Birth place      Sports
0  Mariam   21  Engineering            9       Cairo    Football
1    Jana   19     Pharmacy            1  Alexandria  Basketball
2   Fatma   20    Dentistry           11      Assiut    Swimming


### Updating Column Values


In [221]:
df['Age']=df['Age']+1
print("==========In the next year===========")
print(df)

     Name  Age        Major  Birth Month Birth place      Sports
0  Mariam   22  Engineering            9       Cairo    Football
1    Jana   20     Pharmacy            1  Alexandria  Basketball
2   Fatma   21    Dentistry           11      Assiut    Swimming


### Dropping Columns

In [226]:
df=df.drop(0, axis=0) #dropping a row
print(df)

    Name  Age      Major  Birth Month Birth place      Sports
1   Jana   20   Pharmacy            1  Alexandria  Basketball
2  Fatma   21  Dentistry           11      Assiut    Swimming


In [228]:
df=df.drop('Age', axis=1) #dropping a column
print(df)

    Name      Major  Birth Month Birth place      Sports
1   Jana   Pharmacy            1  Alexandria  Basketball
2  Fatma  Dentistry           11      Assiut    Swimming


# Handling Missing Data

### Detecting Missing Values

In [252]:
dtf=pd.DataFrame({'Mariam Goals':[20,16,None],'Fatma Goals':[18,None,21],'Sara Goals':[None,None,22]})
print(dtf.isnull()) #Displaying Boolean values


   Mariam Goals  Fatma Goals  Sara Goals
0         False        False        True
1         False         True        True
2          True        False       False


In [254]:
print(dtf) #Displaying real values

   Mariam Goals  Fatma Goals  Sara Goals
0          20.0         18.0         NaN
1          16.0          NaN         NaN
2           NaN         21.0        22.0


In [256]:
dtf.isna() #As same as isnull() they can be used interchangabely

Unnamed: 0,Mariam Goals,Fatma Goals,Sara Goals
0,False,False,True
1,False,True,True
2,True,False,False


In [258]:
dtf.isnull().sum().count() #sum to sum up all null values per column and count the number of columns that contain null

3

In [262]:
dtf.isnull().sum().sum() #sum to sum up all null values per column and another sum to sum up all the nulls in the data frame

4

### Filling Missing Values

In [267]:
filling_dtf=dtf.fillna(10) #The value between brackets that fill out the nones
print(filling_dtf)

   Mariam Goals  Fatma Goals  Sara Goals
0          20.0         18.0        10.0
1          16.0         10.0        10.0
2          10.0         21.0        22.0


In [269]:
dtf['Fatma Goals']=dtf['Fatma Goals'].fillna(dtf['Fatma Goals'].mean())
print(dtf['Fatma Goals'])

0    18.0
1    19.5
2    21.0
Name: Fatma Goals, dtype: float64


### Dropping Rows with Missing Values

In [272]:
dtf.dropna() # by default it drops rows

Unnamed: 0,Mariam Goals,Fatma Goals,Sara Goals


In [274]:
dtf.dropna(axis=1) # droping columns that include any null value

Unnamed: 0,Fatma Goals
0,18.0
1,19.5
2,21.0


# Grouping and Aggregation

### Applying Custom Functions with apply()

In [334]:
def Bonus(x):
    return x + 1000

employees_Data = pd.DataFrame({
    'Name': ['Mariam', 'Ali', 'Ziad'],
    'Salary': [8000, 10000, 9000],
    'Department': ['IT', 'HR', 'Social Media']
})

# Apply the bonus conditionally to the Salary column
employees_Data['Salary after bonus'] = employees_Data['Salary'].apply(lambda x: Bonus(x) if x < 9000 else x)

print(employees_Data)

     Name  Salary    Department  Salary after bonus
0  Mariam    8000            IT                9000
1     Ali   10000            HR               10000
2    Ziad    9000  Social Media                9000


### Group by function

In [340]:
company_data=pd.DataFrame({'Name':['Mariam','Mayar','Fatma'],
              'Department':['IT','IT','Social Media'],
                          'Salary':[20000,15000,10000]})
print(company_data.groupby('Department').sum())


                     Name  Salary
Department                       
IT            MariamMayar   35000
Social Media        Fatma   10000
