# Sampling

In [None]:
""" Techniques used for sampling. What is the main advantage of sampling?

Sampling is defined as the process of selecting a sample from a group of people 
or from any particular kind for research purposes. It is one of the most 
important factors which decides the accuracy of a research result.

Mainly, there are two types of sampling techniques:
Probability sampling: It involves random selection which makes every element 
get a chance to be selected. 
Probability sampling has various subtypes in it, as mentioned below:
    Simple Random Sampling - Stratified sampling - Systematic sampling
    Cluster Sampling - Multi-stage Sampling

Non-probability sampling follows non-random selection which means the 
selection is done based on your ease or any other required criteria. 
This helps to collect the data easily. 
Following are various types of sampling:
Convenience Sampling - Purposive Sampling - Quota Sampling - Snowball Sampling
""" 

# Random Sampling - (Python Random Choice Select)

In [None]:
import numpy 
from numpy import random 
from numpy.random import seed, randint

def generate_dataset(num_of_samples, weights):
  """ return a set of samples (phi(x), y) randomly which are classified
  correctly by wieghts """ 
  random.seed(42)
  """ 
  return a single element, a tuple (fn(x), y)
  phi(x) should be a dict of keys 
  that are randomly selected as per random sample 
  weights should be a vector of weights 
  weights = [1, 2, 3, 4, 5, 6, 7, 8 ]
  y should be 1 or 0 as classified by the weight vector
  """ 
  def generate_one_sample():
    phi = None
    y = None

    phi = {}
    for item in random.sample(len(weights)): 
      phi[item] = random.randint(1, len(weights))
    dotProd = 0 
    for i in (0, len(weights) -1):
      #vector_w = list(weights) ; vector_phi = list(phi)
      dotProd += list(weights)[i] * list(phi)[i]
    if dotProd <1: y =0 
    else: y =1 

    return (phi, y)
  return [generate_one_sample() for i in range(num_of_samples)]

weights = {1:0.1, 2:0.1, 3:0.001, 4:0.001, 5:0.03, 6:0.003, 7:0.01, 8:0.001 }
generate_dataset(4, weights)


[({0.3745401188473625: 4,
   0.9507143064099162: 3,
   0.7319939418114051: 6,
   0.5986584841970366: 5,
   0.15601864044243652: 2,
   0.15599452033620265: 4,
   0.05808361216819946: 6,
   0.8661761457749352: 6},
  1),
 ({0.0007787658410143283: 4,
   0.9922115592912175: 4,
   0.6174815096277165: 7,
   0.6116531604882809: 6,
   0.007066305219717406: 6,
   0.023062425041415757: 7,
   0.5247746602583891: 6,
   0.3998609717152555: 3},
  1),
 ({0.38246199126716274: 6,
   0.9832308858067882: 2,
   0.4667628932479799: 2,
   0.8599404067363206: 1,
   0.6803075385877797: 2,
   0.450499251969543: 5,
   0.013264961159866528: 2,
   0.9422017556848528: 4},
  1),
 ({0.24102546602601171: 6,
   0.6832635188254582: 6,
   0.6099966577826209: 6,
   0.8331949117361643: 2,
   0.17336465350777208: 4,
   0.3910606075732408: 6,
   0.18223608778806233: 5,
   0.7553614103176525: 7},
  1)]

In [None]:
import numpy 
from numpy import random 
from numpy.random import seed, randint

def generate_dataset(num_of_samples, weights):
  """ return a set of samples (phi(x), y) randomly which are classified
  correctly by wieghts """ 
  random.seed(42)
  """ 
  return a single element, a tuple (fn(x), y)
  phi(x) should be a dict of keys 
  that are a subset of keys in the weights [requirement]
  weights should be a vector of weights 
  weights = [1, 2, 3, 4, 5, 6, 7, 8 ]
  y should be 1 or 0 as classified by the weight vector
  """ 
  def generate_one_sample():
    phi = None
    y = None

    phi = {}
    for item in range(len(weights)): 
      phi[item] = random.randint(1, len(weights))
    dotProd = 0 
    for i in (0, len(weights) -1):
      vector_w = list(weights) ; vector_phi = list(phi)
      dotProd += vector_w[i] * vector_phi[i]
    if dotProd <1: y =0 
    else: y =1 

    return (phi, y)
  return [generate_one_sample() for i in range(num_of_samples)]

weights = {1:0.1, 2:0.1, 3:0.001, 4:0.001, 5:0.03, 6:0.003, 7:0.01, 8:0.001 }
generate_dataset(4, weights)


[({0: 7, 1: 4, 2: 5, 3: 7, 4: 3, 5: 5, 6: 5, 7: 7}, 1),
 ({0: 2, 1: 3, 2: 7, 3: 3, 4: 3, 5: 5, 6: 4, 7: 3}, 1),
 ({0: 6, 1: 5, 2: 2, 3: 4, 4: 6, 5: 6, 6: 2, 7: 4}, 1),
 ({0: 5, 1: 1, 2: 4, 3: 2, 4: 6, 5: 5, 6: 4, 7: 1}, 1)]

# Utils 
- generateExample, readExample, evaluatePredictor, outputWeights, 
- error analysis, generateCluster, outputCluster

In [None]:
from collections import Counter
def generateClusteringExample(numExamples, numWordsPerTopic, numFilterWords):
  """ generate artifical sentiments inspired by sentiment for clustering
      each review is a hidden sentiment positive or negative for a topic
      the actual sentiment for review consists of 2 sentiment words, 4 topics,
      and 2 filter words 
      example: 
      good:1 great:1 music1:2 music10:1 music4:2 filter0:1 filter10:1 
  """
  sentiments = [['bad', 'awful', 'worst', 'terrible'], ['good', 'great', 'fantastic', 'excellent']]
  topics = ['attitude', 'acting', 'music']

  def generateExample():
    x = Counter()
    # select two sentiment words 
    sentimentWords = [random.choice(sentiment) for sentiment in sentiments ]
    x[random.choice(sentimentWords)] +=1
    x[random.choice(sentimentWords)] +=1
    # select 4 topic words for a fixed topic 
    topic = random.choice(topics)
    x[topic +  str(random.randint(0, numWordsPerTopic-1))] +=1
    x[topic +  str(random.randint(0, numWordsPerTopic-1))] +=1
    x[topic +  str(random.randint(0, numWordsPerTopic-1))] +=1
    x[topic +  str(random.randint(0, numWordsPerTopic-1))] +=1
    #select 2 filter words
    x['filter ' +  str(random.randint(0, numFilterWords-1))] +=1
    return x 

  random.seed(42)
  examples = [generateExample() for _ in range(numExamples) ]
  return examples 

generateClusteringExample(2, 4, 2)

[Counter({'worst': 2, 'music0': 2, 'music2': 1, 'music1': 1, 'filter 0': 1}),
 Counter({'worst': 2,
          'attitude2': 1,
          'attitude1': 2,
          'attitude0': 1,
          'filter 0': 1})]

In [None]:
def dotProduct(d1, d2):
  """
    d1 represents a feature mapping from a feature string to a weight (float32)
    d2 same as d1 
    returns dot product of d1 with d2 
  """
  if len(d1) < len(d2):
    return dotProduct(d2, d1)
  else:
    return sum(d1.get(f,v)*v for f, v in list(d2.items()) )

d1 = {1:3, 2:4, 3:5, 4:6}
d2 = {1:3, 2:4, 3:1, 4:1, 5:0, 6:0}
dotProduct(d1, d2)

36

In [1]:
def increment(d1, scale, d2):
  """
  d1 feature vector that is changed [mutated]
  d2 feature vector
  scale float data type 
  modify d1 
  """
  for f,v in list(d2.items()):
    d1[f] = d1.get(f,v) + v * scale 
d1 = {1:3, 2:4, 3:5, 4:6}
d2 = {1:3, 2:4, 3:1, 4:1, 5:0, 6:0}
increment(d1, 4.5, d2)
d1

{1: 16.5, 2: 22.0, 3: 9.5, 4: 10.5, 5: 0.0, 6: 0.0}

In [None]:
def evaluatePredictor(examples, predictor):
  """
  predictor if a predictor() function that takes x, and returns y 
  given a list of (x,y), returns the fraction of misclassified examples 
  """
  error =0 
  for x, y in examples:
    if predictor(x) !=y: error +=1 
  return 1.0*error/len(examples)

In [None]:
def outputWeights(weights, path):
  """ 
  output to a file the list of weights given or calculated 
  """ 
  out = open(path, 'w', encoding='utf-8')
  #for f,v in sorted(list(weights.items()), key=lambda f_v: =-f_v[1]):
  for f,v in sorted(list(weights.items())):
    print('\t'.join([str(f), str(v)]), file=out)
  out.close()

weights = {2:0.001, 1:0.016, 3:0.004, 6:0.001, 5:0.000, 4:0.001}
path = '/content/weights_2.csv'
outputWeights(weights, path)
!head weights_2.csv

1	0.016
2	0.001
3	0.004
4	0.001
5	0.0
6	0.001
