In [1]:
!pip -q install rich

In [2]:
from rich.progress import track

In [3]:
from tqdm import tqdm
import time

### Create a progress bar with tqdm and rich
Using the progress bar is beneficial when you want to have a visual status of a given task

In [4]:
# Helper applied to every item in the progress-bar demos
def compute_double(x):
    """Return twice the value of ``x``."""
    doubled = 2 * x
    return doubled

In [5]:
# Rich progress bar: wrapping the iterable in track() renders the progress
final_dict_doubles = {}

for number in track(range(20), description="Computing 2.n..."):
    label = f"Value = {number}"
    final_dict_doubles[label] = f"double = {compute_double(number)}"

    # Pause on every iteration so the bar advances slowly enough to watch
    time.sleep(0.8)

Output()

In [6]:
# tqdm progress bar: same loop as the Rich version, wrapped in tqdm().
# It writes into the existing final_dict_doubles, overwriting the same keys
# with the same values.
for number in tqdm(range(20), desc="Computing 2.n..."):
    label = f"Value = {number}"
    final_dict_doubles[label] = f"double = {compute_double(number)}"

    # Pause on every iteration so the bar advances slowly enough to watch
    time.sleep(1)

Computing 2.n...: 100%|██████████| 20/20 [00:20<00:00,  1.01s/it]


### Get day, month, year, day of the week, the month of the year

In [8]:
import pandas as pd
# Raw candidate records, one list per column; dates are month/day/year strings
candidate_info = {
    "Name": ["Aisha", "Vinod", "Pia", "Maya", "Kanika", "Mohan"],
    "Degree": ["Master", "Master", "Bachelor", "PhD", "Master", "PhD"],
    "From": ["Ahmedabad", "Mumbai", "Delhi", "Pune", "Bangalore", "Kerala"],
    "Application_date": [
        "11/17/2022", "09/23/2022", "12/2/2021",
        "08/25/2022", "01/07/2022", "12/26/2022",
    ],
}

# Build the dataframe from the column dictionary
candidate_df = pd.DataFrame(data=candidate_info)

In [10]:
# Parse the application dates using an explicit month/day/year format.
# Spelling the format out keeps ambiguous strings such as '01/07/2022'
# from depending on pandas' automatic format inference.
candidate_df['Application_date'] = pd.to_datetime(
    candidate_df["Application_date"], format="%m/%d/%Y"
)

In [11]:
# Derive calendar components from the parsed application dates
application_date = candidate_df["Application_date"]
date_parts = application_date.dt

# New column name -> values extracted through the .dt accessor
derived_columns = {
    "Day": date_parts.day,
    "Month": date_parts.month,
    "Year": date_parts.year,
    "Day_of_week": date_parts.day_name(),
    "Month_of_year": date_parts.month_name(),
}
for column_name, column_values in derived_columns.items():
    candidate_df[column_name] = column_values

In [12]:
# Display the candidate table, including the derived date columns
candidate_df

Unnamed: 0,Name,Degree,From,Application_date,Day,Month,Year,Day_of_week,Month_of_year
0,Aisha,Master,Ahmedabad,2022-11-17,17,11,2022,Thursday,November
1,Vinod,Master,Mumbai,2022-09-23,23,9,2022,Friday,September
2,Pia,Bachelor,Delhi,2021-12-02,2,12,2021,Thursday,December
3,Maya,PhD,Pune,2022-08-25,25,8,2022,Thursday,August
4,Kanika,Master,Bangalore,2022-01-07,7,1,2022,Friday,January
5,Mohan,PhD,Kerala,2022-12-26,26,12,2022,Monday,December


### Smallest and largest values of the column
If you want to get the rows with the largest or smallest values for a given column:
1. **df.nlargest(N, "Col_Name")** -> top N rows based on Col_Name
2. **df.nsmallest(N, "Col_Name")** -> N smallest rows based on Col_Name

Note : **Col_Name** is the name of the column you're interested in

In [19]:
# Student records, one list per column
student_info = dict(
    Name=["Arpit", "Aisha", "Pia", "Mohan", "Kanika", "Mansi"],
    Degree=["MS", "Bachelor", "PhD", "Master", "BS", "Masters"],
    From=["Bangalore", "Mumbai", "Kerala", "Delhi", "Ahmedabad", "Pune"],
    Age=[26, 19, 30, 23, 24, 25],
)

# Build the dataframe from the column dictionary
student_data = pd.DataFrame(data=student_info)

In [20]:
# Display the full student table
student_data

Unnamed: 0,Name,Degree,From,Age
0,Arpit,MS,Bangalore,26
1,Aisha,Bachelor,Mumbai,19
2,Pia,PhD,Kerala,30
3,Mohan,Master,Delhi,23
4,Kanika,BS,Ahmedabad,24
5,Mansi,Masters,Pune,25


In [21]:
# The three rows with the lowest "Age" values (youngest students)
student_data.nsmallest(n=3, columns="Age")

Unnamed: 0,Name,Degree,From,Age
1,Aisha,Bachelor,Mumbai,19
3,Mohan,Master,Delhi,23
4,Kanika,BS,Ahmedabad,24


In [22]:
# The three rows with the highest "Age" values (oldest students)
student_data.nlargest(n=3, columns="Age")

Unnamed: 0,Name,Degree,From,Age
2,Pia,PhD,Kerala,30
0,Arpit,MS,Bangalore,26
5,Mansi,Masters,Pune,25


### Ignore the log output of the pip install command
Sometimes when installing a library from your jupyter notebook, you might not want to have all the details about the installation process generated by default.

**pip install** command

You can specify the `-q` or `--quiet` option to get rid of that information

In [23]:
!pip -q install spacy-transformers

### Run multiple commands in a single notebook cell

Use the **%%bash** cell magic on the first line of a notebook cell

### Virtual Environment

One way of keeping each project's dependencies isolated is to use a virtual environment

1. Install the virtual environment module : **pip install virtualenv**

2. Create your environment by giving it a meaningful name : **virtualenv [your_environment_name]**

3. Activate your environment : **source [your_environment_name]/bin/activate**

4. Start installing the dependencies for your project : **pip install pandas**

If you need to save those dependencies permanently in order to share them with others, use this command:
**pip freeze > requirements.txt**

This will create a **requirements.txt** file containing your project dependencies.

Finally anyone can install the exact same dependencies by running this command : **pip install -r requirements.txt**

### Run multiple metrics at once
Scikit Learn metrics

In [24]:
# Individual Imports
from sklearn.metrics import precision_score, recall_score, f1_score

# Ground-truth and predicted class labels for a 3-class toy problem
y_true = [0, 1, 2, 0, 1, 2]
y_pred = [0, 2, 1, 0, 0, 1]

# Each metric needs its own call; all of them use macro averaging
labelled_metrics = (
    ("Precision : ", precision_score),
    ("Recall : ", recall_score),
    ("F1 Score : ", f1_score),
)
for label, metric in labelled_metrics:
    print(label, metric(y_true, y_pred, average='macro'))

Precision :  0.2222222222222222
Recall :  0.3333333333333333
F1 Score :  0.26666666666666666


In [28]:
# Single Line Import 
from sklearn.metrics import precision_recall_fscore_support 

In [29]:
# Compute the three macro-averaged metrics with a single call.
# The F-score is unpacked into `f1` rather than `f1_score` so that the
# `f1_score` function imported from sklearn.metrics is not rebound to a float.
# The fourth returned value is discarded.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_true, y_pred, average='macro'
)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Precision: 0.2222222222222222
Recall: 0.3333333333333333
F1 Score: 0.26666666666666666


### Chain multiple lists as a single sequence
This can be achieved using the **chain**() function from Python **itertools** module

In [31]:
from itertools import chain

In [32]:
# Two independent lists: client names and places of interest
clients = ["LG", "Samsung", "Sony"]
place_of_interests = ["Newyork", "California", "Chicago"]

# chain() yields every item of the first list, then every item of the
# second, as one continuous sequence
combined = chain(clients, place_of_interests)
for entry in combined:
    print(entry)

LG
Samsung
Sony
Newyork
California
Chicago


### Pretty print of JSON data
The indent parameter of the dumps() method can be used to specify the indentation level of your formatted string output.

In [33]:
# Import the library
import json

# Raw JSON text describing two candidates (kept as a single string literal)
candidate_data = '[{"Name":"Adrianne", "Age":23, \
                    "Degrees":["Bachelor", "Master"]},\
                    {"Name":"Nalia", "Age":28, \
                    "Degrees":["Bachelor", "Master", "PhD"]}]'

# Parse the JSON text into Python lists and dictionaries
candidate_Json = json.loads(candidate_data)

# Serialize back to text, indenting each nesting level by 4 spaces
pretty_candidates = json.dumps(candidate_Json, indent=4)
print(pretty_candidates)

[
    {
        "Name": "Adrianne",
        "Age": 23,
        "Degrees": [
            "Bachelor",
            "Master"
        ]
    },
    {
        "Name": "Nalia",
        "Age": 28,
        "Degrees": [
            "Bachelor",
            "Master",
            "PhD"
        ]
    }
]


### Unit-Testing
Benefits :
1. Better quality code
2. Allows simpler and more agile code when adding new features
3. Reduces cost by saving dev time and avoiding later stages of error discovery

Use **unittest** to perform unit testing 

In [34]:
# Import the library
import unittest

# Function to test
def product(a, b):
    """Return the product of ``a`` and ``b``."""
    return a * b


# To execute the tests from a notebook cell, one common pattern is:
#     unittest.main(argv=[""], exit=False)
class TestProduct(unittest.TestCase):
    """Unit tests for :func:`product`."""

    def test_product_two_variables(self):
        """The result must equal the exact product, not merely differ from a wrong value."""
        self.assertEqual(12, product(4, 3))
        self.assertNotEqual(16, product(4, 3))

    def test_product_with_zero(self):
        """Multiplying by zero yields zero."""
        self.assertEqual(0, product(0, 7))

    def test_product_with_negative(self):
        """The sign of a negative factor is carried into the result."""
        self.assertEqual(-12, product(-4, 3))

### Iterate over multiple lists
The go-to approach is Python's built-in **zip** function

In [35]:
# Parallel lists: the i-th name belongs with the i-th location
names = ["Sonia", "Kanika", "Nalia"]
locations = ["Atlanta", "Arizona", "Chicago"]

# zip() pairs the items position by position
for person, place in zip(names, locations):
    print(f'{person} : {place}')

Sonia : Atlanta
Kanika : Arizona
Nalia : Chicago


### Alternative to nested for loops
Use the **product**() function from Python's **itertools** module instead.

In [36]:
# Define the three input lists; the following cells look for triples
# (one value taken from each list) whose product equals 24
first_list = [4, 12, 6]
second_list = [5, 2, 19]
third_list = [7, 2, 3]

In [37]:
# Cartesian product, method 1: three explicitly nested loops
for fst in first_list:
    for snd in second_list:
        for trd in third_list:
            # Skip every combination whose product is not 24
            if fst * snd * trd != 24:
                continue
            print(f'1st: {fst} 2nd: {snd} 3rd:{trd}')

1st: 4 2nd: 2 3rd:3
1st: 6 2nd: 2 3rd:2


In [40]:
# Run the cartesian product
## Method 2
# itertools.product is imported under an alias so that it does not overwrite
# the `product(a, b)` function defined in the unit-testing section.
from itertools import product as cartesian_product

# A single flat loop visits the same combinations as the nested loops
for fst, snd, trd in cartesian_product(first_list, second_list, third_list):
    if fst * snd * trd == 24:
        print(f'1st: {fst} 2nd: {snd} 3rd:{trd}')

1st: 4 2nd: 2 3rd:3
1st: 6 2nd: 2 3rd:2


### Text preprocessing made easy
How many functions or regular expressions do you have to write to perform basic text processing tasks like :

1. Fixing Unicode
2. Removing URLs
3. Getting rid of digits, punctuation, etc?

Those tasks are not only time-consuming but may also result in complexity depending on the text

Using the **clean-text** Python library can take away all that burden

In [63]:
from cleantext import clean

# Sample text containing a URL, capitalized words, a number and punctuation
info = ''' "https://www.youtube.com/@DataInterviewPro"
Emma Ding teaches Machine Learning on her YouTube 
channel Emma Ding 
Also, she has written more than 10 articles on Medium. 
Check the links below. '''

# Call clean() with no extra arguments, i.e. with its default settings.
# NOTE(review): no install cell for this package is visible here; confirm
# which distribution provides `cleantext`, since the recorded output shows
# stemming and stop-word removal.
cleaned_text = clean(info)

print(cleaned_text)

httpswwwyoutubecomdatainterviewpro emma ding teach machin learn youtub channel emma ding also written articl medium check link
