# Basic commands in Pandas - creation of Pandas objects

We run through the basic commands in pandas for creating and working with series, dataframes, index objects

Reference : [Python DataScience Handbook](https://jakevdp.github.io/PythonDataScienceHandbook/), Chapter 3, Introducing Pandas Objects

In [1]:
import pandas as pd

# Series

## Creating a Series

In [3]:
#Creating a series from a list
series_1 = pd.Series([2,4,6,8])
series_1

0    2
1    4
2    6
3    8
dtype: int64

In [4]:
#Creating a series from a list and specifying indices
series_2 = pd.Series([2,4,6,8], index = ['a', 'b', 'c', 'd'])
series_2

a    2
b    4
c    6
d    8
dtype: int64

In [5]:
#Creating a series from a dictionary
series_3 = pd.Series({'e': 2, 'f': 4, 'g': 6, 'h': 8})
series_3

e    2
f    4
g    6
h    8
dtype: int64

In [12]:
#Creating a series from part of a dictionary
series_4 = pd.Series({'e': 2, 'f': 4, 'g': 6, 'h': 8}, index = ['g', 'h'])
series_4

g    6
h    8
dtype: int64

In [15]:
#Creating a constant value series
series_5 = pd.Series('c', index=[1,3,5,7])
series_5

1    c
3    c
5    c
7    c
dtype: object

## Accessing data from a series

In [13]:
# Accessing an element from a series
series_1[1], series_2['a'], series_2.iloc[0], series_4.loc['g']

(4, 2, 2, 6)

In [18]:
#All values of a series
series_3.values, type(series_3.values)

(array([2, 4, 6, 8]), numpy.ndarray)

In [19]:
#Indices of a series
series_3.index, type(series_3.index)

(Index(['e', 'f', 'g', 'h'], dtype='object'), pandas.core.indexes.base.Index)

In [26]:
#Slice of a series
series_3[['e', 'f']], type(series_3[['e', 'f']])

(e    2
 f    4
 dtype: int64,
 pandas.core.series.Series)

In [25]:
#Slice of a series
series_3[0:3], type(series_3[0:2])

(e    2
 f    4
 g    6
 dtype: int64,
 pandas.core.series.Series)

In [28]:
#Slice of a series
series_3['e':'g'], type(series_3['e':'g'])

(e    2
 f    4
 g    6
 dtype: int64,
 pandas.core.series.Series)

# Index

In [47]:
a = pd.Series([1,3,5,7], index=[0,3,2,1])
b = pd.Series([10, 'a', 'b'], index = [1, 'A', 'B'])

In [48]:
#Index objects of series
a.index, b.index

(Int64Index([0, 3, 2, 1], dtype='int64'), Index([1, 'A', 'B'], dtype='object'))

In [51]:
#Shape and type of index object
a.index.size, a.index.shape, a.index.ndim, a.index.dtype


(4, (4,), 1, dtype('int64'))

## Index object as an ordered set

In [41]:
#Set operations on index objects : intersection, union, symmetric difference
a.index & b.index, a.index | b.index, a.index ^ b.index

(Index([1], dtype='object'),
 Index([0, 1, 2, 3, 'A', 'B'], dtype='object'),
 Index([0, 2, 3, 'A', 'B'], dtype='object'))

In [46]:
#Set operations as methods of index objects
a.index.intersection(b.index),a.index.union(b.index),a.index.symmetric_difference(b.index)

(Index([1], dtype='object'),
 Index([0, 1, 2, 3, 'A', 'B'], dtype='object'),
 Index([0, 2, 3, 'A', 'B'], dtype='object'))

## Index object as an immutable array

Immutable because you cannot modify an index value .. a.index[0] = 'something_else' will give an error

In [55]:
#Array operations
a.index, a.index[:-1], a.index[[1,2]], a.index[0::2]

(Int64Index([0, 3, 2, 1], dtype='int64'),
 Int64Index([0, 3, 2], dtype='int64'),
 Int64Index([3, 2], dtype='int64'),
 Int64Index([0, 2], dtype='int64'))

# DataFrames

## Creating a DataFrame

In [59]:
#Creating an empty dataframe
column_names_1 = ["A","B", "C","D"]
row_names_1 = ['name','place','animal', 'thing']
df_1 = pd.DataFrame(columns = column_names_1, index = row_names_1)
df_1

Unnamed: 0,A,B,C,D
name,,,,
place,,,,
animal,,,,
thing,,,,


In [66]:
#Creating a dataframe from a list of lists/2-d numpy arrays
df_2 = pd.DataFrame([[1,2,3,4],[5,6,7,8]], columns = ['c1', 'c2', 'c3', 'c4'], index = ['cat', 'dog'])
df_2

Unnamed: 0,c1,c2,c3,c4
cat,1,2,3,4
dog,5,6,7,8


In [79]:
#Creating a dataframe from a dictionary of series objects
series_a = pd.Series([23,45,231,21], index = ['X', 'Y', 'Z', 'W'])
series_b = pd.Series([12,15,31,43], index = ['X', 'Y', 'c', 'd'])

df_3 = pd.DataFrame({'a_series': series_a,
              'b_series': series_b})
df_3

Unnamed: 0,a_series,b_series
W,21.0,
X,23.0,12.0
Y,45.0,15.0
Z,231.0,
c,,31.0
d,,43.0


In [77]:
#Creating a dataframe from a dictionary of dictionaries
dict_a = {'X':23, 'Y':45, 'Z':231, 'W':21}
dict_b = {'X': 12, 'Y': 15, 'c': 31, 'd':43}

df_4 = pd.DataFrame({'a_dict': series_a,
              'b_dict': series_b})
df_4

Unnamed: 0,a_dict,b_dict
W,21.0,
X,23.0,12.0
Y,45.0,15.0
Z,231.0,
c,,31.0
d,,43.0


In [78]:
#Creating a dataframe from a list of series objects
series_a = pd.Series([23,45,231,21], index = ['X', 'Y', 'Z', 'W'])
series_b = pd.Series([12,15,31,43], index = ['X', 'Y', 'c', 'd'])

df_5 = pd.DataFrame([series_a,series_b])
df_5

Unnamed: 0,X,Y,Z,W,c,d
0,23.0,45.0,231.0,21.0,,
1,12.0,15.0,,,31.0,43.0


In [76]:
#Creating a dataframe from a list of dictionaries
dict_a = {'X':23, 'Y':45, 'Z':231, 'W':21}
dict_b = {'X': 12, 'Y': 15, 'c': 31, 'd':43}

df_6 = pd.DataFrame([dict_a,dict_b])
df_6

Unnamed: 0,X,Y,Z,W,c,d
0,23,45,231.0,21.0,,
1,12,15,,,31.0,43.0


## Getting columns and row names 

In [64]:
new_df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [65]:
type(new_df.columns)

pandas.core.indexes.base.Index

In [66]:
new_df.index

Int64Index([1, 2, 3, 4, 5, 6], dtype='int64')

In [67]:
type(new_df.index)

pandas.core.indexes.numeric.Int64Index

## Accessing and adding entries to a cell in the dataframe

In [82]:
new_df.loc[2,'C'] = '2+C'

In [83]:
new_df

Unnamed: 0,A,B,C,D
1,new_a,new_b,new_c,new_d
2,b,q,2+C,new_f
3,c,r,,
4,d,s,,
5,e,t,,
6,,,,new_g


In [70]:
new_df.loc[3,'D']

nan

## Adding entries to a column in the dataframe

In [71]:
new_df['A'] = ['1+a', '2+a', '3+a','4+a', '5+a', '6+a']

In [72]:
new_df['B'] = pd.Series(['1+b', '2+b', '3+b', '4+b', '5+b', '6+B'], index=[1,2,3,4,5,6])

In [73]:
new_df

Unnamed: 0,A,B,C,D
1,a,p,,
2,b,q,something,
3,c,r,,
4,d,s,,
5,e,t,,
6,f,u,,


## Adding entries to a row in the dataframe

In [74]:
new_df.loc[1] = ['new_a +1', 'new_b + 1', 'new_c + 1', 'new_d+1']

In [75]:
new_df.loc[2, -2:] = ['new_c+2', 'new_d+2']

In [76]:
new_df.loc[6] = pd.Series('new_g', index=['D'])

In [77]:
new_df

Unnamed: 0,A,B,C,D
1,new_a,new_b,new_c,new_d
2,b,q,new_e,new_f
3,c,r,,
4,d,s,,
5,e,t,,
6,,,,new_g


## Accessing multiple entries in the dataframe

In [78]:
new_df.loc[:, 'A']

1    new_a
2        b
3        c
4        d
5        e
6      NaN
Name: A, dtype: object

In [79]:
new_df.loc[[1,2], ['A']]

Unnamed: 0,A
1,new_a
2,b


In [80]:
new_df.iloc[:, 1]

1    new_b
2        q
3        r
4        s
5        t
6      NaN
Name: B, dtype: object

In [81]:
new_df.iloc[1:3, 2:4]

Unnamed: 0,C,D
2,new_e,new_f
3,,


# Dropping columns

In [72]:
#Drop unnecessary column
del new_df['C']

In [70]:
#Read csv data and put it into a dataframe
eval_results_df= pd.read_csv('eval_results_final.csv')


In [73]:
#Add a new column called solved (1 if the model solves the puzzle completely correctly, 0 otherwise)
eval_results_df['solved'] = eval_results_df.apply(lambda row: 1 if row.no_of_pieces == row.correct_pos_and_orientation else 0, axis=1)

In [75]:
#creating dataframes for individual models
model_names = ['AdjacencyClassifier_NoML', 'FromScratch', 'ResNetFT']
models_stats_dict= {}
for model_name in model_names:
    models_stats_dict[model_name]=eval_results_df[eval_results_df.model==model_name].reset_index(drop=True)


# Evaluation

## Sizes of test puzzles

In [76]:
sizes_of_puzzles = list(eval_results_df.no_of_pieces.unique())

In [77]:
puzzle_size_freq = eval_results_df['no_of_pieces'].value_counts()//3

In [78]:
puzzle_size_freq = puzzle_size_freq.rename_axis('puzzle_sizes').reset_index(name='no_of_puzzles')

In [79]:
puzzle_size_freq

Unnamed: 0,puzzle_sizes,no_of_puzzles
0,36,67
1,25,11
2,16,2


## Statistics

In [80]:
eval_stats_df = pd.DataFrame(columns=model_names, index=['avg_time_taken','percentage_solved'])

In [81]:
#average time taken
avg_time_taken = {}
for model_name in model_names:
    avg_time_taken[model_name] =models_stats_dict[model_name]["time_taken"].mean()
    
eval_stats_df.loc['avg_time_taken'] = pd.Series(avg_time_taken)

In [82]:
#percentage solved
percentage_solved = {}
for model_name in model_names:
    percentage_solved[model_name] = models_stats_dict[model_name]["solved"].sum()/no_of_test_images
    
eval_stats_df.loc['percentage_solved'] = pd.Series(percentage_solved)

In [83]:
#average puzzle size
avg_puzzle_sizes = {}
for model_name in model_names:
    avg_puzzle_sizes[model_name] =models_stats_dict[model_name]["no_of_pieces"].mean()
    
eval_stats_df.loc['avg_puzzle_sizes'] = pd.Series(avg_puzzle_sizes)

In [84]:
eval_stats_df

Unnamed: 0,AdjacencyClassifier_NoML,FromScratch,ResNetFT
avg_time_taken,11.2126,18.9795,23.0482
percentage_solved,0.875,0.375,0.875
avg_puzzle_sizes,33.9875,33.9875,33.9875


## Comparison of models

In [85]:
#examples' solved statuses vs models
comparison_df = pd.DataFrame(columns=model_names, index=list(range(no_of_test_images)))

for model_name in model_names:
    comparison_df[model_name]= models_stats_dict[model_name]["solved"]

In [86]:
#comparing the symmetric difference of two models at a time
model_name_pairs = []
for i,j in [(0,1), (0,2), (1,2)]:
    
    models_in_comparison = model_names[i]+"/"+model_names[j]
    model_name_pairs.append(models_in_comparison)
    comparison_df[models_in_comparison] = comparison_df.apply(lambda row: row[model_names[i]] ^ row[model_names[j]], axis=1)

In [87]:
for model_name_pair in model_name_pairs:
    first_model = model_name_pair.split("/")[0]
    second_model = model_name_pair.split("/")[1]
    
    a = eval_stats_df.loc['percentage_solved'][first_model]*no_of_test_images 
    b = eval_stats_df.loc['percentage_solved'][second_model]*no_of_test_images
    #a+b-2x = comparison_df[model_name_pair].sum()
    #solved_jointly = x
    solved_jointly = int((a+b-comparison_df[model_name_pair].sum())/2)
    solved_by_first_model_alone = int(a - solved_jointly)
    solved_by_second_model_alone = int(b - solved_jointly)
    unsolved_jointly = int(no_of_test_images - (a+b-solved_jointly))
    print("")
    print(f"Comparing {first_model} and {second_model}")
    print("")
    print(f"Solved jointly = {solved_jointly}")
    print(f"Unsolved jointly = {unsolved_jointly}")
    print(f"Solved by {first_model} alone = {solved_by_first_model_alone}")
    print(f"Solved by {second_model} alone = {solved_by_second_model_alone}")
    print("")
    print("********************************")
    
    


Comparing AdjacencyClassifier_NoML and FromScratch

Solved jointly = 30
Unsolved jointly = 10
Solved by AdjacencyClassifier_NoML alone = 40
Solved by FromScratch alone = 0

********************************

Comparing AdjacencyClassifier_NoML and ResNetFT

Solved jointly = 66
Unsolved jointly = 6
Solved by AdjacencyClassifier_NoML alone = 4
Solved by ResNetFT alone = 4

********************************

Comparing FromScratch and ResNetFT

Solved jointly = 30
Unsolved jointly = 10
Solved by FromScratch alone = 0
Solved by ResNetFT alone = 40

********************************


In [88]:
#Examples solved by all three models
comparison_df["solved_by_all_three"] = comparison_df.apply(lambda row: 1 if row[model_names[0]]==1 and row[model_names[1]] == 1 and row[model_names[2]]==1 else 0, axis=1)

In [89]:
solved_by_all_three = comparison_df["solved_by_all_three"].sum()

In [90]:
#Examples solved by at least one of the three models
comparison_df["solved_by_atleast_one"] = comparison_df.apply(lambda row: row[model_names[0]] + row[model_names[1]] + row[model_names[2]], axis=1)

In [91]:
solved_by_atleast_one = len(comparison_df[comparison_df["solved_by_atleast_one"]>0])

In [92]:
solved_by_none = len(comparison_df[comparison_df["solved_by_atleast_one"]<=0])

In [93]:
print(f'Solved by all the three models = {solved_by_all_three}')
print(f'Solved by at least one model = {solved_by_atleast_one}')
print(f'Solved by none of the models = {solved_by_none}')

Solved by all the three models = 30
Solved by at least one model = 74
Solved by none of the models = 6


## Examples on which models underperformed

### Extracting information from the dataframes

In [97]:
unsolved_indices_dict = {}
unsolved_indices_dict["All"]= list(comparison_df[comparison_df["solved_by_atleast_one"]<=0].index)
for model_name in model_names:
    unsolved_indices_dict[model_name] = list(models_stats_dict[model_name][models_stats_dict[model_name]["solved"]==0].index)


In [98]:
unsolved_images_dict = {}
for key in unsolved_indices_dict:
    unsolved_images_dict[key] = set()
    for i in unsolved_indices_dict[key]:
        unsolved_images_dict[key].add(models_stats_dict["AdjacencyClassifier_NoML"].loc[i]["image_name"])