In [None]:
# Import the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the wine review CSV into a DataFrame, decoding it as ISO-8859-1
df = pd.read_csv(
    '/kaggle/input/wineinternship/winedateset.csv',
    encoding='ISO-8859-1',
)

# Preview the top of the table (head() returns five rows by default)
print(df.head())


In [None]:
# The ten most frequent countries, most common first
top_countries = df['country'].value_counts().index[:10]

# Draw the counts on an explicitly created 8x6 figure
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='country', data=df, order=top_countries, ax=ax)

# Title and axis labels
ax.set_title('Top 10 Wine Producing Countries')
ax.set_xlabel('Country', fontsize=12)
ax.set_ylabel('Number of Wines Produced', fontsize=12)

# Tilt the country names 45 degrees
plt.setp(ax.get_xticklabels(), rotation=45)

plt.show()



In [None]:
# Horizontal box plot of price for every variety, on a 10x6 figure
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='price', y='variety', orient='h', ax=ax)

# Title and axis labels
ax.set_title('Price Distribution by Variety')
ax.set_xlabel('Price', fontsize=12)
ax.set_ylabel('Variety', fontsize=12)

plt.show()

In [None]:
# The ten most frequent varieties, most common first. This index drives both
# the bar order and the tick labels, so the two can never disagree.
top_varieties = df['variety'].value_counts().index[:10]

# Set the size of the figure
plt.figure(figsize=(8, 6))

# Create the count plot
sns.countplot(x='variety', data=df, order=top_varieties)

# Set the title of the plot
plt.title('Top 10 Wine Varieties')

# Set the label for the x-axis and its font size
plt.xlabel('Variety', fontsize=10)

# Set the label for the y-axis
plt.ylabel('Number of Wines Produced')

# Set the font size of the x-axis tick labels and rotate them 90 degrees.
# The labels themselves are the variety names seaborn derived from `order`;
# a hand-written label list is deliberately NOT applied here, because it
# would silently mislabel the bars whenever the data's actual top-10 ranking
# differs from the list.
plt.xticks(rotation=90, fontsize=10)

# Show the plot
plt.show()

In [None]:
# The ten most frequent provinces, most common first
top_provinces = df['province'].value_counts().index[:10]

# Count plot of those provinces on an 8x6 figure
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='province', data=df, order=top_provinces, ax=ax)

# Title and axis labels
ax.set_title('Top 10 Wine Producing Provinces')
ax.set_xlabel('Province')
ax.set_ylabel('Number of Wines Produced')

# Tilt the province names 60 degrees so they do not overlap
plt.setp(ax.get_xticklabels(), rotation=60)

plt.show()

In [None]:
# Summarise the rating column: mean (rounded to 2 decimals), lowest and highest
ratings = df['points']
print('Average Wine Rating:', round(ratings.mean(), 2))
print('Minimum Wine Rating:', ratings.min())
print('Maximum Wine Rating:', ratings.max())

In [None]:
# Report the highest and lowest values found in the price column
prices = df['price']
print('Most Expensive Wine:', prices.max())
print('Cheapest Wine:', prices.min())

In [None]:
# Count non-null descriptions per review title and keep the ten largest counts
top_10_wines = (
    df.groupby('review_title')['review_description']
    .count()
    .sort_values(ascending=False)
    .head(10)
)
print("Top 10 Most Reviewed Wines:\n", top_10_wines)

# Mean rating per country, best-rated countries first
country_rating = (
    df.groupby('country')['points']
    .mean()
    .sort_values(ascending=False)
)
print("\nCountries with the Highest Rated Wines:\n", country_rating)


In [None]:
from collections import Counter
import re

# Define a function to count the occurrences of each word in a column of text
def get_word_counts(column):
    """Count how often each word occurs across an iterable of text entries.

    Each entry is lower-cased and split into words with the regex ``\\w+``
    (runs of letters, digits and underscores), so counting is
    case-insensitive and punctuation is ignored.

    Parameters
    ----------
    column : iterable
        Text entries, e.g. a pandas Series or a list of strings. Entries
        that are not strings (such as NaN or None for missing values) are
        skipped instead of raising an error.

    Returns
    -------
    collections.Counter
        Mapping of word -> number of occurrences over all string entries.
    """
    counts = Counter()
    for review in column:
        # Missing values are not strings (pandas uses a float NaN), and
        # calling .lower() on them would raise AttributeError.
        if not isinstance(review, str):
            continue
        # Update incrementally rather than building one huge list of words
        counts.update(re.findall(r'\w+', review.lower()))
    return counts

# Ten most frequent words across all review titles
title_counts = get_word_counts(df.loc[:, 'review_title'])
print(title_counts.most_common(n=10))

# Ten most frequent words across all review descriptions
description_counts = get_word_counts(df.loc[:, 'review_description'])
print(description_counts.most_common(n=10))


In [None]:
# Keep only the rows where both rating and price are present.
# np.corrcoef propagates NaN: a single missing value in either column would
# turn every coefficient in the result into NaN.
rated_priced = df[['points', 'price']].dropna()

# scatter plot of points vs. price
plt.scatter(rated_priced['points'], rated_priced['price'])
plt.xlabel('Points')
plt.ylabel('Price')
plt.show()

# compute the 2x2 correlation coefficient matrix between points and price
corr = np.corrcoef(rated_priced['points'], rated_priced['price'])
print(corr)


In [None]:
# Correlation matrix of the rating and price columns
corr = df[['points', 'price']].corr()
print(corr)

# Wines rated above 90 points that are priced below 20
highly_rated = df['points'] > 90
inexpensive = df['price'] < 20
df.loc[highly_rated & inexpensive]


In [None]:
# Number of non-null descriptions per (region, variety) pair, largest first,
# flattened back into ordinary columns
popular_grapes = (
    df.groupby(['region', 'variety'])['review_description']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)

# Up to 15 rows per region, in that descending-count order
popular_grapes.groupby('region').head(15)

In [None]:
# importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Read the wine review CSV again into its own DataFrame
wine_data = pd.read_csv(
    '/kaggle/input/wineinternship/winedateset.csv',
    encoding='ISO-8859-1',
)

# Hold out 20% of the (description, variety) pairs for testing
X_train, X_test, y_train, y_test = train_test_split(
    wine_data['review_description'],
    wine_data['variety'],
    test_size=0.2,
    random_state=42,
)

# TF-IDF features: vocabulary is learned on the training text only,
# English stop words are removed
vectorizer = TfidfVectorizer(stop_words='english')
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Fit a logistic regression classifier with default settings
model = LogisticRegression().fit(X_train_vectorized, y_train)

# Predict the variety of each held-out review
y_pred = model.predict(X_test_vectorized)

# Fraction of held-out reviews whose variety was predicted correctly
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


In [None]:
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Load the dataset
df = pd.read_csv('/kaggle/input/wineinternship/winedateset.csv', encoding='ISO-8859-1')

# Column groups; each group needs a different kind of preprocessing
numeric_cols = ['year', 'points', 'price']
categorical_cols = ['country', 'province', 'region', 'winery']

# Select the relevant features for the model
X = df[['country', 'review_title', 'review_description', 'year', 'points', 'price', 'province', 'region', 'winery']]
y = df['variety']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Vectorize the text features. Each column gets its own vectorizer so the
# title vocabulary is not lost when the description vocabulary is fitted.
# Missing text is replaced by an empty string, which TF-IDF can process.
title_vectorizer = TfidfVectorizer()
X_train_title = title_vectorizer.fit_transform(X_train['review_title'].fillna(''))
X_test_title = title_vectorizer.transform(X_test['review_title'].fillna(''))

desc_vectorizer = TfidfVectorizer()
X_train_desc = desc_vectorizer.fit_transform(X_train['review_description'].fillna(''))
X_test_desc = desc_vectorizer.transform(X_test['review_description'].fillna(''))

# One-hot encode the categorical columns: the classifier only accepts numeric
# input, so raw strings cannot be passed to it. Categories that appear only
# in the test split are encoded as all zeros instead of raising an error.
encoder = OneHotEncoder(handle_unknown='ignore')
X_train_cat = encoder.fit_transform(X_train[categorical_cols].fillna('missing'))
X_test_cat = encoder.transform(X_test[categorical_cols].fillna('missing'))

# Fill missing numeric values with the median learned from the training split
imputer = SimpleImputer(strategy='median')
X_train_num = imputer.fit_transform(X_train[numeric_cols])
X_test_num = imputer.transform(X_test[numeric_cols])

# Combine all feature groups column-wise. Everything stays sparse: converting
# the TF-IDF matrices to dense arrays would allocate rows x vocabulary floats.
X_train_final = hstack(
    [csr_matrix(X_train_num), X_train_title, X_train_desc, X_train_cat],
    format='csr',
)
X_test_final = hstack(
    [csr_matrix(X_test_num), X_test_title, X_test_desc, X_test_cat],
    format='csr',
)

# Train the Random Forest Classifier model
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train_final, y_train)

# Predict the wine variety for the testing set
y_pred = rfc.predict(X_test_final)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
