In [1]:
import pandas as pd
import pandera as pa

In [2]:
series = pd.Series(list(range(1, 11)),name="integer_series")

# Define a Pandera schema for the Series
schema = pa.SeriesSchema(
    pa.Int,  # Specify the data type as integer
    nullable=False,  # Ensure there are no null values
    name="integer_series",  # Optional: name for the series
)

# Validate the Series
validated_series = schema(series)

# Print the validated Series
display(validated_series)

# Print the data type
print(type(validated_series)) 

0     1
1     2
2     3
3     4
4     5
5     6
6     7
7     8
8     9
9    10
Name: integer_series, dtype: int64

<class 'pandas.core.series.Series'>


In [3]:
s2 = pd.Series({
    "a": 1,
    "b": 2,
    "c": 3,
    "d": 4,
    "e": 5,
    "f": 6,
    "g": 7,
    "h": 8,
    "i": 9,
    "j": 10
})

# Define a Pandera schema for the Series
s2_schema = pa.SeriesSchema(
    pa.Int,  # Specify the data type as integer
    nullable=False,  # Ensure there are no null values
    index=pa.Index(pa.String),  # Specify the index type as string
    checks=[
        pa.Check.in_range(min_value=1, max_value=10),  # Ensure values are between 1 and 10
        # pa.Check(lambda s: s.sum() == 55, error="Sum of values must be 55"),  # Custom check
    ]
)

# Validate the Series
validated_s2 = s2_schema(s2)

# Print the validated Series
print(validated_s2)

a     1
b     2
c     3
d     4
e     5
f     6
g     7
h     8
i     9
j    10
dtype: int64


In [4]:
from typing import List 
values : List[int] = [1, 2, 3, 4, 5]
index : List[str] = ['a', 'b', 'c', 'd', 'e']
s1: pd.Series = pd.Series(values, index=index)
print(s1) 

a    1
b    2
c    3
d    4
e    5
dtype: int64


In [5]:
from typing import List 
values : List[int] = [1, 2, 3, 4, 5]
index : List[List[str]] = [['a1', 'a1', 'a1', 'b1', 'b1'], ['a', 'b', 'c', 'd', 'e']]
s1: pd.Series = pd.Series(values, index=index, name='student_data')
print(s1) 

a1  a    1
    b    2
    c    3
b1  d    4
    e    5
Name: student_data, dtype: int64


In [6]:
import numpy as np

values : List[np.int32] = np.array([1, 2, 3, 4, 5], dtype=np.int32)
values

array([1, 2, 3, 4, 5])

In [7]:
# Create a Dataframe using series
s1:pd.Series = pd.Series([1, 2, 3, 4, 5], name='id')
s2:pd.Series = pd.Series([3.06, 2.5, 3.5, 4.00, 2.7], name='cgpa')
s3:pd.Series = pd.Series(['Muhammad Ali', 'Anus', 'Maryam Jahangir', 'Eraj Rizvi', 'Yasir Abbas'], name='name')
df:pd.DataFrame = pd.DataFrame({'id': s1, 'name': s3, 'cgpa': s2})
df

Unnamed: 0,id,name,cgpa
0,1,Muhammad Ali,3.06
1,2,Anus,2.5
2,3,Maryam Jahangir,3.5
3,4,Eraj Rizvi,4.0
4,5,Yasir Abbas,2.7


In [8]:
# Create a Dataframe using series
s1:pd.Series = pd.Series([1, 2, 3, 4, 5], name='id')
s2:pd.Series = pd.Series([3.06, 2.5, 3.5, 4.00, 2.7], name='cgpa')
s3:pd.Series = pd.Series(['Muhammad Ali', 'Anus', 'Maryam Jahangir', 'Eraj Rizvi', 'Yasir Abbas'], name='name')
df:pd.DataFrame = pd.concat([s1, s3,s2], axis=1)
df

Unnamed: 0,id,name,cgpa
0,1,Muhammad Ali,3.06
1,2,Anus,2.5
2,3,Maryam Jahangir,3.5
3,4,Eraj Rizvi,4.0
4,5,Yasir Abbas,2.7


In [9]:
from typing import Union
data: List[List[Union[str, int]]] = [[1, 2, 3, 4, 5], [3.06, 2.5, 3.5, 4.00, 2.7], ['Muhammad Ali', 'Anus', 'Maryam Jahangir', 'Eraj Rizvi', 'Yasir Abbas']]

In [10]:
df = pd.DataFrame(data)
df = df.transpose()
df.columns = ['id', 'cgpa', 'name']
df

Unnamed: 0,id,cgpa,name
0,1,3.06,Muhammad Ali
1,2,2.5,Anus
2,3,3.5,Maryam Jahangir
3,4,4.0,Eraj Rizvi
4,5,2.7,Yasir Abbas


In [11]:
# Create a Dataframe using series
s1:pd.Series = pd.Series([1, 2, 3, 4, 5], name='id')
s2:pd.Series = pd.Series([3.06, 2.5, 3.5, 4.00, 2.7], name='cgpa')
s3:pd.Series = pd.Series(['Muhammad Ali', 'Anus', 'Maryam Jahangir', 'Eraj Rizvi', 'Yasir Abbas'], name='name')
df:pd.DataFrame = pd.DataFrame({'id': s1, 'name': s3, 'cgpa': s2}, columns=['id', 'name', 'cgpa'])
df.index = ['42201', '42202', '42203', '42204', '42205']
df

Unnamed: 0,id,name,cgpa
42201,1,Muhammad Ali,3.06
42202,2,Anus,2.5
42203,3,Maryam Jahangir,3.5
42204,4,Eraj Rizvi,4.0
42205,5,Yasir Abbas,2.7


In [12]:
df.columns

Index(['id', 'name', 'cgpa'], dtype='object')

In [13]:
df.index

Index(['42201', '42202', '42203', '42204', '42205'], dtype='object')

In [14]:
df.values

array([[1, 'Muhammad Ali', 3.06],
       [2, 'Anus', 2.5],
       [3, 'Maryam Jahangir', 3.5],
       [4, 'Eraj Rizvi', 4.0],
       [5, 'Yasir Abbas', 2.7]], dtype=object)

In [15]:
from nptyping import NDArray, Shape, Int64 
data: NDArray[Shape['Size,Size'], Int64] = np.arange(10 * 10).reshape(10, 10)
df:pd.DataFrame =  pd.DataFrame(data, columns=list('abcdefghij'))
df

Unnamed: 0,a,b,c,d,e,f,g,h,i,j
0,0,1,2,3,4,5,6,7,8,9
1,10,11,12,13,14,15,16,17,18,19
2,20,21,22,23,24,25,26,27,28,29
3,30,31,32,33,34,35,36,37,38,39
4,40,41,42,43,44,45,46,47,48,49
5,50,51,52,53,54,55,56,57,58,59
6,60,61,62,63,64,65,66,67,68,69
7,70,71,72,73,74,75,76,77,78,79
8,80,81,82,83,84,85,86,87,88,89
9,90,91,92,93,94,95,96,97,98,99


In [16]:
df:List[pd.DataFrame] = pd.read_html('https://www.w3schools.com/python/python_operators.asp')
df[0]

Unnamed: 0,Operator,Name,Example,Try it
0,+,Addition,x + y,Try it »
1,-,Subtraction,x - y,Try it »
2,*,Multiplication,x * y,Try it »
3,/,Division,x / y,Try it »
4,%,Modulus,x % y,Try it »
5,**,Exponentiation,x ** y,Try it »
6,//,Floor division,x // y,Try it »


In [17]:
# URL of the JSON data
url = 'https://www.w3schools.com/python/pandas/data.js'

# Read JSON directly into a DataFrame
df: pd.DataFrame = pd.read_json(url)

# Display the DataFrame
df.head()

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409.1
1,60,117,145,479.0
2,60,103,135,340.0
3,45,109,175,282.4
4,45,117,148,406.0


In [18]:
df:pd.DataFrame = pd.DataFrame(
    {
        'id': [1, 2, 3, 4, 5], 
        'cgpa': [3.06, 2.5, 3.5, 4.00, 2.7], 
        'name': ['Muhammad Ali', 'Anus', 'Maryam Jahangir', 'Eraj Rizvi', 'Yasir Abbas']
    }, 
    columns=['id', 'cgpa', 'name']
)
df.index = ['42201', '42202', '42203', '42204', '42205']

# Define the schema using Pandera's DataFrameSchema
schema = pa.DataFrameSchema(
    {
        # Column 'id': Integer type with unique constraint
        'id': pa.Column(int, checks=pa.Check.in_range(1, 5), unique=True, nullable=False),

        # Column 'name': Float type with range constraint
        'name': pa.Column(str,  nullable=False),

        # Column 'cgpa': String type
        'cgpa': pa.Column(float, checks=pa.Check.in_range(0.0, 5.0), nullable=False)
    },
    index=pa.Index(str, name=None),
    strict=True, # Ensures no extra columns are present
    coerce=True  # Automatically converts data to match schema types
)

# Validate the DataFrame against the schema
# collects all validation errors and reports them simultaneously, rather than stopping at the first error. 
validated_df = schema.validate(df, lazy=True)

# Display the validated DataFrame
print(validated_df)

       id  cgpa             name
42201   1  3.06     Muhammad Ali
42202   2  2.50             Anus
42203   3  3.50  Maryam Jahangir
42204   4  4.00       Eraj Rizvi
42205   5  2.70      Yasir Abbas


In [19]:
print(dir(pa.check))

['__annotations__', '__builtins__', '__call__', '__class__', '__closure__', '__code__', '__defaults__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__get__', '__getattribute__', '__getstate__', '__globals__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__kwdefaults__', '__le__', '__lt__', '__module__', '__name__', '__ne__', '__new__', '__qualname__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__type_params__']


In [20]:
s1: pd.Series = pd.Series(list(range(1, 11)))
print(s1)
print(s1[1:4])

0     1
1     2
2     3
3     4
4     5
5     6
6     7
7     8
8     9
9    10
dtype: int64
1    2
2    3
3    4
dtype: int64


In [21]:
s1: pd.Series = pd.Series(list(range(1, 11)))
print(s1)
print(s1.iloc[1:4])
print(s1.iloc[1:4:2])

0     1
1     2
2     3
3     4
4     5
5     6
6     7
7     8
8     9
9    10
dtype: int64
1    2
2    3
3    4
dtype: int64
1    2
3    4
dtype: int64


In [22]:
# Create a Dataframe using series
s1:pd.Series = pd.Series([1, 2, 3, 4, 5], name='id')
s2:pd.Series = pd.Series([3.06, 2.5, 3.5, 4.00, 2.7], name='cgpa')
s3:pd.Series = pd.Series(['Muhammad Ali', 'Anus', 'Maryam Jahangir', 'Eraj Rizvi', 'Yasir Abbas'], name='name')
df:pd.DataFrame = pd.DataFrame({'id': s1, 'name': s3, 'cgpa': s2}, columns=['id', 'name', 'cgpa'])
df.index = ['42201', '42202', '42203', '42204', '42205']
print(f'DataFrame:\n\n{df}\n')
print(f'df.iloc[1:4]:\n\n{df.iloc[1:4]}\n')
print(f'df.iloc[1:4, [0,2]]:\n\n{df.iloc[1:4, [0,2]]}\n')

DataFrame:

       id             name  cgpa
42201   1     Muhammad Ali  3.06
42202   2             Anus  2.50
42203   3  Maryam Jahangir  3.50
42204   4       Eraj Rizvi  4.00
42205   5      Yasir Abbas  2.70

df.iloc[1:4]:

       id             name  cgpa
42202   2             Anus   2.5
42203   3  Maryam Jahangir   3.5
42204   4       Eraj Rizvi   4.0

df.iloc[1:4, [0,2]]:

       id  cgpa
42202   2   2.5
42203   3   3.5
42204   4   4.0



In [23]:
print(f"df.loc[['42202','42205'], ['name', 'cgpa']]:\n\n{df.loc[['42202','42205'], ['name', 'cgpa']]}\n")
# print(f"df.loc[[1, 4], ['name', 'cgpa']]:\n\n{df.loc[[1, 4], ['name', 'cgpa']]}\n")
print(f"df['name'].at['42202']:\n\n{df['name'].at['42202']}")

df.loc[['42202','42205'], ['name', 'cgpa']]:

              name  cgpa
42202         Anus   2.5
42205  Yasir Abbas   2.7

df['name'].at['42202']:

Anus


In [24]:
import re
with open('./chat_contain_roll_no.txt', encoding='utf-8') as f:
    file_content = f.read()
re.findall(r'PIAIC\s?\d{5,6}', file_content)

['PIAIC80919',
 'PIAIC80919',
 'PIAIC80919',
 'PIAIC80919',
 'PIAIC80919',
 'PIAIC80919',
 'PIAIC101499',
 'PIAIC123456',
 'PIAIC73919',
 'PIAIC73919',
 'PIAIC210905',
 'PIAIC120702',
 'PIAIC218333',
 'PIAIC139495',
 'PIAIC58320']

In [52]:
import re
with open('./chat_contain_roll_no.txt', encoding='utf-8') as f:
    file_content = f.read()

# Regular expression pattern to capture time, name, and PIAIC roll number
pattern = r"(\d{2}:\d{2}:\d{2}) From (.*?) to Everyone:\s+.*?(\bPIAIC[-_ ]?\d+|\bPIAIC\d+|\bPIaic\s?\d+|\bPIAIC\d+)\b"

# Find all matches
matches = re.findall(pattern, file_content, re.IGNORECASE | re.DOTALL)
matches

[('20:03:10', 'Dr. Ghulam Shabbir', 'PIAIC80919'),
 ('21:01:33', 'Faizan Hassan', 'PIAIC80919'),
 ('21:02:26', 'Taif Ullah', 'PIAIC80919'),
 ('21:04:42', 'Amanat Wattoo', 'PIAIC80919'),
 ('21:06:00', 'Amanat Wattoo', 'PIAIC80919'),
 ('21:09:50', 'Yasir', 'PIAIC80919'),
 ('21:32:29', 'Ali Zar FSD', 'PIAIC-173738'),
 ('21:41:51', 'Hamza', 'PIAIC-201785'),
 ('21:41:52', 'jhon wick', 'piaic 223880'),
 ('21:41:54', 'Hina Zargham', 'PIAIC101499'),
 ('21:41:54', 'Hatif Humayun', 'PIAIC-52822'),
 ('21:41:54', 'Ahmed Siddiqui', 'PIAIC123456'),
 ('21:41:56', 'Arif Najmi', 'PIAIC73919'),
 ('21:42:00', 'STONE', 'PIAIC_126369'),
 ('21:42:03', '.', 'PIAIC210905'),
 ('21:42:06', 'ABDUL KHALIQ', 'PIAIC-604031'),
 ('21:42:11', 'Arshad Siddiqui', 'PIAIC120702'),
 ('21:42:13', 'Ali Zar FSD', 'PIaic 223972'),
 ('21:42:13', 'Azfar Suhail', 'PIAIC218333'),
 ('21:42:14', 'Kamran Ahmed', 'PIAIC139495'),
 ('21:42:18', 'Ahmed', 'PIAIC-225620'),
 ('21:42:25', 'Kamal Hassan', 'PIAIC58320'),
 ('21:42:29', 'Ahmed',

In [54]:
df:pd.DataFrame = pd.DataFrame(matches, columns=['time', 'name', 'roll_no'])
df.head()

Unnamed: 0,time,name,roll_no
0,20:03:10,Dr. Ghulam Shabbir,PIAIC80919
1,21:01:33,Faizan Hassan,PIAIC80919
2,21:02:26,Taif Ullah,PIAIC80919
3,21:04:42,Amanat Wattoo,PIAIC80919
4,21:06:00,Amanat Wattoo,PIAIC80919
