In [1]:
# Import dependencies.
import pandas as pd

In [2]:
# Load the Vine review table from its CSV export into a dataframe.
# The path is relative to the notebook's working directory.
vine_data_to_load = "Resources/vine_table.csv"
vine_table_df = pd.read_csv(filepath_or_buffer=vine_data_to_load)

In [3]:
# Show the first five rows (the default for head) to sanity-check the load.
vine_table_df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
0,RTIS3L2M1F5SM,5,0,0,N,Y
1,R1ZV7R40OLHKD,5,0,0,N,Y
2,R3BH071QLH8QMC,1,0,1,N,Y
3,R127K9NTSXA2YH,3,0,0,N,Y
4,R32ZWUXDJPW27Q,4,0,0,N,Y


In [4]:
# Keep only the reviews that received at least 20 votes in total; reviews
# with that many votes are more likely to be helpful.
total_votes_df = vine_table_df.loc[vine_table_df["total_votes"].ge(20)]

In [5]:
# Inspect the first five rows that passed the total_votes filter.
total_votes_df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
55,R4PKAZRQJJX14,1,21,34,N,N
74,R2CI0Y288CC7E2,1,21,35,N,Y
209,R127WEQY2FM1T3,1,147,175,N,Y
289,R3EZ0EPYLDA34S,1,14,31,N,Y
483,R2FJ94555FZH32,2,55,60,N,N


In [6]:
# From the well-voted reviews, keep those where at least half of the votes
# were helpful votes, i.e. helpful_votes / total_votes >= 0.5.
helpful_votes_df = total_votes_df.loc[
    total_votes_df["helpful_votes"].div(total_votes_df["total_votes"]).ge(0.5)
]

In [7]:
# Inspect the first five rows that passed the helpful-vote ratio filter.
helpful_votes_df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
55,R4PKAZRQJJX14,1,21,34,N,N
74,R2CI0Y288CC7E2,1,21,35,N,Y
209,R127WEQY2FM1T3,1,147,175,N,Y
483,R2FJ94555FZH32,2,55,60,N,N
537,R1U3AR67RE273L,1,51,65,N,Y


In [8]:
# Helpful reviews written as part of the Vine program (paid): vine == 'Y'.
vine_review_yes_df = helpful_votes_df.loc[helpful_votes_df["vine"].eq('Y')]

In [9]:
# Inspect the first five Vine (paid) reviews.
vine_review_yes_df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
32611,R3KKUSGFZWSUIY,5,56,63,Y,N
33112,R10FO5UKKVZBK2,3,23,23,Y,N
69680,RM4KSGEOR7MU1,5,19,24,Y,N
155361,RG7VRMYLEXD23,4,22,26,Y,N
239327,R11O4YSCPSNL6L,3,20,26,Y,N


In [10]:
# Helpful reviews written outside the Vine program (unpaid): vine == 'N'.
vine_review_no_df = helpful_votes_df.loc[helpful_votes_df["vine"].eq('N')]

In [11]:
# Inspect the first five non-Vine (unpaid) reviews.
vine_review_no_df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
55,R4PKAZRQJJX14,1,21,34,N,N
74,R2CI0Y288CC7E2,1,21,35,N,Y
209,R127WEQY2FM1T3,1,147,175,N,Y
483,R2FJ94555FZH32,2,55,60,N,N
537,R1U3AR67RE273L,1,51,65,N,Y


## Determine the total number of reviews, the number of 5-star reviews, and the percentage of 5-star reviews for the two types of review (paid vs unpaid).

In [12]:
# Vine program: total number of reviews, counted as the non-null
# review_id values among the Vine reviews.
vine_program_count = vine_review_yes_df["review_id"].count()
vine_program_count

94

In [13]:
# Vine program: number of reviews whose star_rating is exactly 5.
vine_program_5star_reviews = vine_review_yes_df.loc[
    vine_review_yes_df["star_rating"].eq(5), "star_rating"
].count()
vine_program_5star_reviews

48

In [14]:
# Outside the Vine program: total number of reviews, counted as the
# non-null review_id values among the non-Vine reviews.
outside_program_count = vine_review_no_df["review_id"].count()
outside_program_count

40471

In [15]:
# Outside the Vine program: number of reviews whose star_rating is exactly 5.
outside_program_5star_reviews = vine_review_no_df.loc[
    vine_review_no_df["star_rating"].eq(5), "star_rating"
].count()
outside_program_5star_reviews

15663

In [16]:
# Combined number of 5-star reviews across the Vine and non-Vine groups.
total_num_5star = sum([vine_program_5star_reviews, outside_program_5star_reviews])
total_num_5star

15711

In [17]:
# Percentage of Vine (paid) reviews that are 5-star.
# The denominator is the number of Vine reviews, not the combined 5-star
# count: dividing by the combined count only measures how large the Vine
# group is relative to the non-Vine group, not how often Vine reviewers
# award five stars.
total_num_vine = (vine_program_5star_reviews / vine_program_count) * 100
total_num_vine

51.06382978723404

In [18]:
# Percentage of non-Vine (unpaid) reviews that are 5-star.
# The denominator is the number of non-Vine reviews, not the combined
# 5-star count: dividing by the combined count only measures the relative
# size of the two groups, not how often unpaid reviewers award five stars.
total_num_outside = (outside_program_5star_reviews / outside_program_count) * 100
total_num_outside

38.701786464381904