<a href="https://colab.research.google.com/github/mashdas/rapido_classical/blob/master/augment_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import tensorflow as tf
# Probe for a TPU. `tpu` holds the cluster resolver when one is found and is
# None otherwise; the strategy-selection cell branches on that value.
try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver() # TPU detection
except ValueError: # If TPU not found
  tpu = None

In [None]:
# Select appropriate distribution strategy.
# Relies on `tf` and `tpu` from the TPU-detection cell, so that cell must run first.
if tpu:
  tf.tpu.experimental.initialize_tpu_system(tpu)
  # NOTE(review): the `experimental` TPUStrategy path and its `steps_per_run`
  # argument may not exist in every TensorFlow release - confirm against the
  # installed version.
  strategy = tf.distribute.experimental.TPUStrategy(tpu, steps_per_run=128)
  print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])  
else:
  strategy = tf.distribute.get_strategy() # Default strategy that works on CPU and single GPU
  # NOTE(review): this branch is taken on every non-TPU runtime (including a
  # GPU one), so the "CPU" message below can be misleading.
  print('Running on CPU instead')
print("Number of accelerators: ", strategy.num_replicas_in_sync)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

## Global frame list.
## NOTE(review): this is module-level mutable state that lives for the whole
## kernel session, so anything appended to it persists across cell re-runs.
frames=[]

def one_metric(x,y):
    """Collapse two coordinates into a single value: the Euclidean norm of (x, y).

    Works element-wise, so `x` and `y` may be scalars or equally shaped
    array-likes (numpy arrays, pandas Series).

    Returns:
        sqrt(x**2 + y**2), with the same shape as the inputs.
    """
    sum_of_squares = x**2 + y**2
    return np.sqrt(sum_of_squares)


def hist_plot(data):
    """Draw a histogram of `data` and return the binning result.

    A new figure with two side-by-side axes (shared y) is created and the
    histogram is drawn on the first one. The figure is left open; the caller
    is responsible for closing it.

    Args:
        data: 1-D array-like exposing `.shape` (e.g. a pandas Series).

    Returns:
        The value returned by `Axes.hist`: a tuple whose element 0 holds the
        per-bin frequencies and element 1 the bin edges, followed by the
        drawn patches.
    """
    fig, axs = plt.subplots(1, 2, sharey=True, tight_layout=True)
    # One bin per two samples, but never fewer than one: an input with fewer
    # than two samples would otherwise request zero bins, which the histogram
    # routine rejects with a ValueError.
    n_bins = max(1, data.shape[0] // 2)
    return axs[0].hist(data, bins=n_bins)
    

def _empty_bin_starts(values):
  """Return the left edges of the histogram bins of `values` that hold no samples."""
  counts, edges = hist_plot(values)[:2]
  return [edges[i] for i, count in enumerate(counts) if count == 0]


def augment(csv, seed=None):
  """Add synthetic negative samples to every id found in a pickup CSV.

  For each distinct `number` the rows are reduced to a single location value
  (`loc`, via `one_metric` on the scaled latitude/longitude) and a scaled
  `time`. Histograms of both are built, and the left edges of their empty
  bins are used as candidate negatives: location/time values at which that
  id has no recorded sample. At most half as many negatives as the id has
  rows are drawn, labelled `result=0`, mixed with the original rows and
  shuffled.

  Args:
    csv: path (or buffer) accepted by `pd.read_csv`. The file must provide the
      columns `number`, `pick_lat`, `pick_lng` and `time`; the returned frame
      additionally carries a `result` column.
    seed: optional integer. When given, negative sampling and shuffling are
      reproducible. When None (default) the global `random` state and
      pandas' default randomness are used, as before.

  Returns:
    A DataFrame holding, for every id, its original rows plus the generated
    negatives, without the `pick_lat` / `pick_lng` / `Unnamed*` columns. Ids
    appear in order of first occurrence; the index restarts at 0 for each id.

  Side effects:
    Prints the per-column null counts of the input and the total number of
    generated negatives, and closes all open matplotlib figures.

  Notes:
    * Frames are collected in a local list. The module-level `frames` list is
      no longer appended to, so calling this function twice (or re-running
      the cell) does not return rows from earlier calls.
    * An id with a single row cannot receive negatives and is passed through
      unchanged instead of requesting a zero-bin histogram.
  """
  rng = random if seed is None else random.Random(seed)

  data = pd.read_csv(csv, na_values=[' None', 'NA', 'NaN'])
  print("STATUS :", data.isnull().sum())

  X = data.copy()
  # Column-wise arithmetic; same scaling as before without a Python call per row.
  X["pick_lat"] = X["pick_lat"] * 1000000
  X["pick_lng"] = X["pick_lng"] * 1000000
  X["time"] = X["time"].round(2) * 100
  # Drop stray index columns written by an earlier to_csv (e.g. "Unnamed: 0").
  X = X.drop(columns=X.columns[X.columns.str.contains('Unnamed', case=False)])

  per_id_frames = []
  gen_values = 0
  # groupby splits the frame once instead of re-scanning it for every id.
  for number, group in X.groupby("number", sort=False):
    positives = group.copy()
    positives["loc"] = one_metric(positives["pick_lat"], positives["pick_lng"])
    positives = positives.drop(["pick_lat", "pick_lng"], axis=1)

    # Never generate more negatives than half the number of real rows.
    quota = positives.shape[0] // 2
    mn = 0
    if quota > 0:
      neg_positions = _empty_bin_starts(positives["loc"])
      neg_time = _empty_bin_starts(positives["time"])
      # The histograms are only needed for their counts; release the figures.
      plt.close('all')
      mn = min(len(neg_time), len(neg_positions), quota)

    if mn > 0:
      gen_values += mn
      neg_time = rng.sample(neg_time, mn)
      neg_positions = rng.sample(neg_positions, mn)
      negatives = pd.DataFrame({
        "number": [int(number)] * mn,
        "time": neg_time,
        "result": [0] * mn,
        "loc": neg_positions,
      })
      combined = pd.concat([positives, negatives])
    else:
      # Nothing to add: skip concatenating an empty frame, which could
      # otherwise influence the result dtypes.
      combined = positives

    # Shuffle so negatives are interleaved with the real rows.
    shuffle_state = None if seed is None else rng.randrange(2**32)
    combined = combined.sample(frac=1, random_state=shuffle_state).reset_index(drop=True)
    per_id_frames.append(combined)

  print(gen_values)
  return pd.concat(per_id_frames)

In [2]:
# Build the augmented dataset (original rows plus sampled negatives) from the raw CSV.
x=augment('7data.csv')

STATUS : Unnamed: 0    0
number        0
pick_lat      0
pick_lng      0
time          0
result        0
dtype: int64
79880


In [6]:
# Persist the augmented frame.
# NOTE(review): to_csv writes the DataFrame index by default; on read-back it
# shows up as an "Unnamed: 0" column like the one in 7data.csv. Consider
# index=False if downstream readers do not need it.
x.to_csv('7data_augmented.csv')

In [5]:
# Sanity check: per-column count of missing values in the augmented frame.
x.isnull().sum()

number    0
time      0
result    0
loc       0
dtype: int64

In [17]:
# Reload the raw CSV.
# NOTE(review): this rebinds `x`, replacing the augmented frame produced by
# augment(); a distinct name would keep both available on re-run.
x=pd.read_csv('7data.csv')