In [None]:
pip install pandas openpyxl

In [None]:
import pandas as pd
import os
print(os.getcwd())
import os
import re

In [None]:
#import os
#import pandas as pd


# ASCII control characters 0x00-0x08, 0x0B-0x0C and 0x0E-0x1F, i.e. every C0
# control character except tab, line feed and carriage return.
# Compiled once here instead of on every call of the function below.
_ILLEGAL_CHAR_PATTERN = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')


def remove_illegal_characters(s):
    """Strip illegal control characters from a string value.

    Args:
        s: Any cell value. Only ``str`` instances are modified.

    Returns:
        ``s`` with every character matched by ``_ILLEGAL_CHAR_PATTERN``
        removed when ``s`` is a string; otherwise ``s`` itself, unchanged
        (numbers, ``None``, NaN, ... pass straight through).
    """
    if isinstance(s, str):
        return _ILLEGAL_CHAR_PATTERN.sub("", s)
    return s

# CSV export to analyse; looked up on the current user's Desktop.
file_path = os.path.join(os.path.expanduser('~'), 'Desktop', 'March032024Opportunities.csv')

# Lower-case phrases searched for (as plain substrings) in each description.
# Defined before the load so that cells relying on `keywords` still find it
# even when reading the CSV fails.
keywords = [
    "project management", "data analytics", "artificial intelligence", "dev ops", "product management",
    "project support", "technical services", "administrative services", "solicitation", "set aside", "women-owned",
    "small business",
]

# Columns carried through to the filtered Excel export.
columns_of_interest = [
    "Title", "Description", "Current Response Date", "Contract Opportunity Type", "Set Aside", "POC Information",
]


def contains_any_keyword(description):
    """Return True if any entry of ``keywords`` occurs in ``description``.

    The value is converted with ``str()`` and lower-cased first, so
    non-string cells (e.g. NaN) are handled without raising.
    """
    description_str = str(description).lower()
    return any(keyword in description_str for keyword in keywords)


try:
    df = pd.read_csv(file_path, encoding='latin-1')
    print("CSV file read successfully!")
except FileNotFoundError:
    print("Error: File not found. Please check the following:")
    print(f"- File path accuracy: {file_path}")
    print("- File existence: Check if the file exists at the specified path.")
    print(f"- Current working directory: {os.getcwd()}")
    print("- File permissions: Verify your script has read access to the file.")
    print("- File encoding: If the file has a different encoding, specify it using the 'encoding' argument in pd.read_csv().")
else:
    missing_columns = [col for col in columns_of_interest if col not in df.columns]
    if missing_columns:
        print(f"Warning: Missing columns in DataFrame - {missing_columns}")

    # Keep only the columns of interest that are actually present. Dropping
    # the *missing* columns (as was done before) is a no-op, because they are
    # by definition not in the frame, and left every CSV column in the export.
    available_columns = [col for col in columns_of_interest if col in df.columns]
    df_filtered = df[available_columns].copy()

    # The keyword filter cannot run without the text column; fail with an
    # explicit message rather than an opaque KeyError from the lookup below.
    if 'Description' not in df_filtered.columns:
        raise KeyError("'Description' column is required for keyword filtering but is missing from the CSV.")

    # Clean every object-dtype (text) column before writing to Excel.
    for col in df_filtered.select_dtypes(include=['object']).columns:
        df_filtered[col] = df_filtered[col].apply(remove_illegal_characters)

    # Rows whose description mentions at least one keyword.
    filtered_df = df_filtered[df_filtered['Description'].apply(contains_any_keyword)]
    if 'Title' in filtered_df.columns:
        print(filtered_df['Title'])

    # Written to the current working directory.
    filtered_df.to_excel('Marchfiltered_Titles_with_keywords.xlsx', index=False)


In [None]:
pip install wordcloud


In [None]:
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Assumes `df` (the loaded opportunities DataFrame) and `keywords` already
# exist from the loading/filtering cell.

# Step 1: Count keyword occurrences
# Lower-case every non-null description once. The str() coercion matches what
# contains_any_keyword does, so a non-string cell cannot break .lower().
descriptions_lower = df['Description'].dropna().astype(str).str.lower()

# keyword -> number of descriptions containing it (plain substring match,
# each description counted at most once per keyword).
keyword_frequencies = {
    keyword: int(descriptions_lower.str.contains(keyword, regex=False).sum())
    for keyword in keywords
}

# Step 2: Visualization
# Bar Chart
plt.figure(figsize=(10, 8))
plt.bar(list(keyword_frequencies.keys()), list(keyword_frequencies.values()), color='skyblue')
plt.xlabel('Keywords')
plt.ylabel('Frequency')
plt.xticks(rotation=45, ha="right")
plt.title('Keyword Frequency in Opportunities')
plt.tight_layout()  # Adjust layout to make room for the rotated x-axis labels
plt.show()

# Word Cloud
# generate_from_frequencies normalises by the largest frequency, so a dict
# whose counts are all zero cannot be rendered; skip the plot in that case.
if any(keyword_frequencies.values()):
    wordcloud = WordCloud(width = 800, height = 400, background_color ='white').generate_from_frequencies(keyword_frequencies)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')  # Remove axis for better visualization
    plt.show()
else:
    print("No keyword matches found; skipping word cloud.")


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import make_pipeline

# Toy dataset: five short descriptions, each paired with a binary label
# (1 for small business set-aside, 0 otherwise).
data = {}
data["Description"] = [
    "A project management opportunity...",
    "Analyze data trends...",
    "...AI to make improvements...",
    "...AI to tag data...",
    "Streamline CI/CD pipelines...",
]
data["Set Aside"] = [1, 0, 1, 0, 1]

# NOTE(review): this rebinds `df`; the DataFrame read from the CSV earlier in
# the notebook is no longer reachable under that name after this cell runs.
df = pd.DataFrame(data)

# Text feature and target label.
X, y = df['Description'], df['Set Aside']

# Reserve a quarter of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.25
)

# TF-IDF vectorisation feeding a logistic regression, fitted on the training
# split in one go (fit() hands back the pipeline itself).
model = make_pipeline(TfidfVectorizer(), LogisticRegression()).fit(X_train, y_train)

# Score the held-out rows.
y_pred = model.predict(X_test)
accuracy, conf_matrix = accuracy_score(y_test, y_pred), confusion_matrix(y_test, y_pred)

# Last expression of the cell: shown as the cell's output.
accuracy, conf_matrix


In [None]:
#Step 1: Preprocess the Data and Feature Engineering

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import make_pipeline

# Assuming 'df' is your DataFrame with 'Description' and 'Set Aside' columns ready


In [None]:
# Split the data into training and testing sets

# Text feature (X) and target label (y), taken from the current `df`.
X, y = df['Description'], df['Set Aside']

# test_size=0.2 holds out a fifth of the rows; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)



In [None]:
# CREATE AND TRAIN THE MODEL

# Build a two-step pipeline: TF-IDF vectorisation followed by logistic regression.
pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression())

# Fit on the training split. As the last expression in the cell, the result of
# fit() is also what the notebook displays.
pipeline.fit(X_train, y_train)



In [None]:
# EVALUATE THE MODEL

# Run the fitted pipeline over the held-out descriptions.
y_pred = pipeline.predict(X_test)

# Fraction of held-out rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy}")

# Counts of true vs. predicted labels, printed beneath its heading.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(f"Confusion Matrix:\n{conf_matrix}")


In [None]:
# MAKE PREDICTIONS ON THE NEW DATA

# Two unseen example descriptions, passed to the pipeline as raw text.
new_descriptions = [
    "An opportunity for small businesses to engage in IT development projects.",
    "Large scale construction project requiring significant capital investment."
]
# Run the fitted pipeline's predict() on the list above.
predictions = pipeline.predict(new_descriptions)

print(predictions)


In [None]:
# Summarize model performance

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Placeholder metrics for illustration only; they are hard-coded, not computed.
# NOTE(review): `accuracy` and `conf_matrix` reuse names assigned by the
# evaluation cell, so those measured values are overwritten here.
accuracy, precision, recall = 0.85, 0.80, 0.75

# Placeholder confusion matrix as a 2x2 nested list.
conf_matrix = [[50, 10], [15, 25]]

# Assemble the report: a leading blank line, a heading, then one bullet per
# metric rendered as a percentage with two decimals.
summary = "\nModel Performance Summary:\n" + "".join(
    f"- {label}: {value * 100:.2f}%\n"
    for label, value in (
        ("Accuracy", accuracy),
        ("Precision", precision),
        ("Recall", recall),
    )
)

print(summary)


In [None]:
# Visualize the confusion matrix as an annotated heatmap (integer cell labels,
# blue colour map, no colour bar), then label the axes it was drawn on.
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False).set(
    xlabel='Predicted',
    ylabel='Actual',
    title='Confusion Matrix',
)
plt.show()
